"""Extract the main card image from each substance sheet of the workbook.

The script follows the worksheet -> drawing -> image relationship chain
inside the Excel package (a zip archive) and saves only the largest image of
every substance sheet to ``out/images/{nameKr}{ext}`` (``.png`` when the
source part has no extension).  It also writes ``out/image-map.json``, which
maps each saved file name to its sheet name / Korean name.

Real card images are expected to be 100 KB or more while icons are only a
few KB; anything below ``MIN_IMAGE_SIZE`` is treated as "no card image".
"""

from __future__ import annotations

import io
import json
import os
import posixpath
import re
import sys
import zipfile
from pathlib import Path
from xml.etree import ElementTree as ET


def _force_utf8_stdout() -> None:
    """Make ``sys.stdout`` emit UTF-8 so the Korean log lines always print.

    ``reconfigure`` switches the encoding of the existing stream in place.
    Wrapping ``sys.stdout.buffer`` in a second ``TextIOWrapper`` is kept only
    as a fallback, because two wrappers sharing one buffer can close it when
    either wrapper is garbage-collected.  Streams that offer neither
    (``None``, ``StringIO`` replacements, ...) are left untouched.
    """
    stream = sys.stdout
    reconfigure = getattr(stream, 'reconfigure', None)
    if callable(reconfigure):
        try:
            reconfigure(encoding='utf-8')
        except (OSError, ValueError):
            # Best effort only: keep whatever encoding the stream already has.
            pass
        return
    buffer = getattr(stream, 'buffer', None)
    if buffer is not None:
        sys.stdout = io.TextIOWrapper(buffer, encoding='utf-8')


_force_utf8_stdout()

SCRIPT_DIR = Path(__file__).parent.resolve()
OUT_DIR = SCRIPT_DIR / 'out'
IMG_DIR = OUT_DIR / 'images'
IMG_DIR.mkdir(parents=True, exist_ok=True)

# Workbook to read; override with the HNS_SOURCE_XLSX environment variable.
SOURCE_XLSX = Path(os.environ.get(
    'HNS_SOURCE_XLSX',
    r'C:\Projects\MeterialDB\유해물질 화물적부도 검색툴.xlsm',
))

NS = {
    'm': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main',
    'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
    'pr': 'http://schemas.openxmlformats.org/package/2006/relationships',
}

# Meta (data) sheets are skipped.
SKIP_SHEETS = {
    '화물적부도 화물코드',
    '항구별 코드',
    '동의어',
    'IBC CODE',
    '경계선',
}

# Guide-number sheets (115-171): the name is exactly three digits.
SKIP_PATTERN = re.compile(r'^\d{3}$')

# Minimum image size (only the main card is wanted, small icons are excluded).
MIN_IMAGE_SIZE = 50_000  # 50 KB

# Fully qualified name of the ``r:id`` attribute on <sheet> elements.
_RID_ATTR = f"{{{NS['r']}}}id"


def safe_filename(name: str) -> str:
    """Turn a sheet name into a file-system-safe base name.

    Surrounding whitespace and trailing commas are removed, then every
    character in ``<>:"/\\|?*`` is replaced by an underscore.
    """
    name = name.strip().rstrip(',').strip()
    name = re.sub(r'[<>:"/\\|?*]', '_', name)
    return name


def norm_path(p: str) -> str:
    """Collapse ``.``/``..`` segments and return the path with ``/`` separators."""
    return os.path.normpath(p).replace(os.sep, '/')


def _rels_path(part: str) -> str:
    """Return the archive name of the ``.rels`` file that belongs to *part*."""
    directory, filename = posixpath.split(part)
    return f'{directory}/_rels/{filename}.rels'


def _resolve_target(source_part: str, target: str) -> str:
    """Resolve a relationship ``Target`` to an archive member name.

    Targets are normally relative to the directory of the part that owns the
    relationship.  A target starting with ``/`` is relative to the package
    root, so only the leading slash is dropped (zip member names never start
    with one).
    """
    if target.startswith('/'):
        return norm_path(target.lstrip('/'))
    return norm_path(posixpath.join(posixpath.dirname(source_part), target))


def _read_relationships(archive: zipfile.ZipFile, part: str) -> list[ET.Element] | None:
    """Return the <Relationship> elements of *part*, or ``None`` if it has no .rels."""
    try:
        with archive.open(_rels_path(part)) as fh:
            root = ET.parse(fh).getroot()
    except KeyError:
        return None
    return root.findall('pr:Relationship', NS)


def _rel_kind(rel: ET.Element) -> str:
    """Return the last path segment of the relationship Type (e.g. ``image``)."""
    return (rel.get('Type') or '').rsplit('/', 1)[-1]


def _is_internal(rel: ET.Element) -> bool:
    """Return True when the relationship points at a part inside the package."""
    return bool(rel.get('Target')) and rel.get('TargetMode') != 'External'


def _sheet_image_parts(archive: zipfile.ZipFile, sheet_part: str) -> list[str]:
    """Collect the image parts reachable from *sheet_part* through its drawings.

    Every drawing-type relationship of the sheet is followed (DrawingML
    ``drawing`` as well as legacy ``vmlDrawing``), so a VML part listed first
    can no longer hide the DrawingML part that holds the pictures.  Images are
    recognised by relationship Type rather than by file name.
    """
    images: list[str] = []
    for sheet_rel in _read_relationships(archive, sheet_part) or ():
        if not _is_internal(sheet_rel):
            continue
        if not _rel_kind(sheet_rel).lower().endswith('drawing'):
            continue
        drawing_part = _resolve_target(sheet_part, sheet_rel.get('Target'))
        for drawing_rel in _read_relationships(archive, drawing_part) or ():
            if _is_internal(drawing_rel) and _rel_kind(drawing_rel) == 'image':
                images.append(_resolve_target(drawing_part, drawing_rel.get('Target')))
    return images


def extract_images(
    xlsx_path: Path,
    img_dir: Path,
    min_size: int = MIN_IMAGE_SIZE,
) -> tuple[dict[str, dict], dict[str, int]]:
    """Save the largest image of every substance sheet and describe the result.

    Args:
        xlsx_path: Workbook (.xlsx/.xlsm) to read.
        img_dir: Directory that receives the image files; created if needed.
        min_size: Smallest uncompressed size, in bytes, that still counts as
            a card image.

    Returns:
        ``(image_map, stats)``.  ``image_map`` maps each written file name to
        ``sheetName`` / ``nameKr`` / ``source`` / ``sizeBytes``.  ``stats``
        holds the ``saved``, ``skipped`` (meta sheets) and ``missing`` (no
        usable image) counters.

    Raises:
        KeyError: ``xl/workbook.xml`` is not present in the archive.
        zipfile.BadZipFile: *xlsx_path* is not a zip archive.
    """
    img_dir = Path(img_dir)
    img_dir.mkdir(parents=True, exist_ok=True)

    image_map: dict[str, dict] = {}
    saved = 0
    skipped = 0
    missing = 0

    with zipfile.ZipFile(xlsx_path) as archive:
        # 1) workbook.xml -> sheet list, workbook.xml.rels -> r:id to part.
        with archive.open('xl/workbook.xml') as fh:
            workbook = ET.parse(fh).getroot()
        sheets = workbook.findall('m:sheets/m:sheet', NS)
        workbook_rels = _read_relationships(archive, 'xl/workbook.xml') or []
        rid_target = {rel.get('Id'): rel.get('Target') for rel in workbook_rels}
        print(f'[시트] 총 {len(sheets)}개')

        for sheet in sheets:
            name = sheet.get('name')
            if name in SKIP_SHEETS or SKIP_PATTERN.match(name or ''):
                skipped += 1
                continue

            target = rid_target.get(sheet.get(_RID_ATTR))
            if not name or not target:
                # Unnamed sheet or dangling r:id: nothing can be extracted.
                missing += 1
                continue
            sheet_part = _resolve_target('xl/workbook.xml', target)

            # 2) sheet -> drawing(s) -> images; ignore targets absent from the zip.
            sized = []
            for part in _sheet_image_parts(archive, sheet_part):
                try:
                    sized.append((archive.getinfo(part).file_size, part))
                except KeyError:
                    continue
            if not sized:
                missing += 1
                continue

            # 3) Keep the largest image (cards are 100 KB+, icons a few KB).
            largest_size, largest_path = max(sized)
            if largest_size < min_size:
                missing += 1
                continue

            # 4) Save it under the sanitised sheet name.
            safe = safe_filename(name)
            ext = os.path.splitext(largest_path)[1].lower() or '.png'
            out_name = f'{safe}{ext}'
            with archive.open(largest_path) as fin, open(img_dir / out_name, 'wb') as fout:
                fout.write(fin.read())

            image_map[out_name] = {
                'sheetName': name,
                'nameKr': safe,
                'source': largest_path,
                'sizeBytes': largest_size,
            }
            saved += 1
            if saved % 25 == 0:
                print(f' {saved}개 저장 완료')

    return image_map, {'saved': saved, 'skipped': skipped, 'missing': missing}


def main() -> None:
    """Extract the images of ``SOURCE_XLSX`` into ``IMG_DIR`` and write the map.

    Raises:
        SystemExit: The source workbook does not exist.
    """
    print(f'[읽기] {SOURCE_XLSX}')
    if not SOURCE_XLSX.exists():
        raise SystemExit(f'소스 파일 없음: {SOURCE_XLSX}')

    image_map, stats = extract_images(SOURCE_XLSX, IMG_DIR)
    saved, skipped, missing = stats['saved'], stats['skipped'], stats['missing']
    print(f'\n[결과] 저장 {saved} / 스킵(메타) {skipped} / 이미지없음 {missing}')

    map_path = OUT_DIR / 'image-map.json'
    with open(map_path, 'w', encoding='utf-8') as f:
        json.dump(image_map, f, ensure_ascii=False, indent=2)
    print(f'[완료] 매핑 파일: {map_path}')


if __name__ == '__main__':
    main()