# wing-ops/backend/scripts/hns-import/extract-images.py

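"""Extract the main card image from each per-substance worksheet.

Follows the workbook's worksheet -> drawing -> image relationship chain and
saves only the largest image of each substance sheet (at least
MIN_IMAGE_SIZE bytes, which filters out small icons) to
out/images/{nameKr}{ext}, keeping the image's original file extension.

Also writes out/image-map.json, which maps each output file name to its
sheet name / Korean substance name.
"""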
from __future__ import annotations

import io
import json
import os
import re
import sys
import zipfile
from pathlib import Path
from xml.etree import ElementTree as ET

# Force UTF-8 on stdout so the Korean status messages this script prints do
# not fail on consoles whose default encoding cannot represent them.
# reconfigure() switches the encoding of the existing stream in place, which
# keeps its line buffering (a fresh TextIOWrapper around sys.stdout.buffer
# would delay progress output on a terminal). Streams that do not offer
# reconfigure() (e.g. a replaced sys.stdout) are left untouched instead of
# raising AttributeError.
if hasattr(sys.stdout, 'reconfigure'):
    sys.stdout.reconfigure(encoding='utf-8')

# Output layout: everything is written under out/ next to this script.
SCRIPT_DIR = Path(__file__).parent.resolve()
OUT_DIR = SCRIPT_DIR / 'out'
IMG_DIR = OUT_DIR / 'images'
# NOTE: this runs at import time, so importing the module creates out/images.
IMG_DIR.mkdir(parents=True, exist_ok=True)

# Source workbook; the HNS_SOURCE_XLSX environment variable overrides the
# default path below.
SOURCE_XLSX = Path(os.environ.get(
    'HNS_SOURCE_XLSX',
    r'C:\Projects\MeterialDB\유해물질 화물적부도 검색툴.xlsm',
))

# Namespace prefixes used in the ElementTree find/findall queries:
#   m  - SpreadsheetML main namespace (workbook.xml)
#   r  - relationship-id attribute namespace (r:id on <sheet>)
#   pr - package relationships namespace (*.rels files)
NS = {
    'm': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main',
    'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
    'pr': 'http://schemas.openxmlformats.org/package/2006/relationships',
}

# Metadata (data) sheets are skipped.
SKIP_SHEETS = {
    '화물적부도 화물코드',
    '항구별 코드',
    '동의어',
    'IBC CODE',
    '경계선',
}
# Guideline-number sheets (115-171): the pattern matches sheet names that
# consist of exactly three digits.
SKIP_PATTERN = re.compile(r'^\d{3}$')

# Minimum image size in bytes (only the main card images are wanted; small
# icons are excluded).
MIN_IMAGE_SIZE = 50_000  # 50 KB


def safe_filename(name: str) -> str:
    """Turn a sheet name into a string usable as a file name.

    Surrounding whitespace and trailing commas are trimmed, then every
    character from the set ``< > : " / \\ | ? *`` is replaced with an
    underscore.
    """
    trimmed = name.strip().rstrip(',').strip()
    # One translation pass maps each forbidden character to '_'.
    forbidden_to_underscore = str.maketrans({ch: '_' for ch in '<>:"/\\|?*'})
    return trimmed.translate(forbidden_to_underscore)


def norm_path(p: str) -> str:
    """Collapse ``.``/``..`` segments in *p* and return it with ``/`` separators.

    The path is normalised with the host's ``os.path`` rules, after which the
    host separator is swapped back to ``/`` (the separator used by zip
    archive member names).
    """
    collapsed = os.path.normpath(p)
    return '/'.join(collapsed.split(os.sep))


def _read_xml(z: zipfile.ZipFile, member: str) -> 'ET.Element | None':
    """Parse archive member *member* as XML; return None if it does not exist."""
    try:
        with z.open(member) as f:
            return ET.parse(f).getroot()
    except KeyError:
        return None


def _rels_path(part: str) -> str:
    """Return the path of the ``.rels`` part that accompanies package part *part*."""
    return os.path.dirname(part) + '/_rels/' + os.path.basename(part) + '.rels'


def _resolve_target(base_part: str, target: str) -> str:
    """Resolve a relationship Target against the part that owns the relationship.

    Targets starting with ``/`` are relative to the package root; all others
    are relative to the directory of *base_part*.
    """
    if target.startswith('/'):
        return norm_path(target.lstrip('/'))
    return norm_path(os.path.join(os.path.dirname(base_part), target))


def _list_sheets(z: zipfile.ZipFile) -> 'list[tuple[str | None, str | None]]':
    """Return ``(sheet name, relationship target)`` for every workbook sheet.

    The target is None when the sheet's r:id has no entry in
    ``xl/_rels/workbook.xml.rels``. A missing workbook part raises KeyError.
    """
    with z.open('xl/workbook.xml') as f:
        wb_root = ET.parse(f).getroot()
    with z.open('xl/_rels/workbook.xml.rels') as f:
        rels_root = ET.parse(f).getroot()
    rid_target = {
        r.get('Id'): r.get('Target')
        for r in rels_root.findall('pr:Relationship', NS)
    }
    rid_attr = f"{{{NS['r']}}}id"
    return [
        (s.get('name'), rid_target.get(s.get(rid_attr)))
        for s in wb_root.findall('m:sheets/m:sheet', NS)
    ]


def _find_card_image(z: zipfile.ZipFile, sheet_file: str) -> 'tuple[int, str] | None':
    """Return ``(size, archive path)`` of the sheet's largest image, or None.

    None is returned when the sheet has no drawing, the drawing references no
    image stored in the archive, or the largest image is smaller than
    MIN_IMAGE_SIZE.
    """
    srels = _read_xml(z, _rels_path(sheet_file))
    if srels is None:
        return None

    # sheet -> drawing. Match on the relationship Type suffix so that a
    # 'vmlDrawing' relationship (legacy comments / form controls) is never
    # mistaken for the drawing part that carries the pictures.
    drawing_rel = None
    for r in srels.findall('pr:Relationship', NS):
        target = r.get('Target') or ''
        if target and (r.get('Type') or '').endswith('/drawing'):
            drawing_rel = target
            break
    if not drawing_rel:
        return None

    drawing_path = _resolve_target(sheet_file, drawing_rel)
    drels = _read_xml(z, _rels_path(drawing_path))
    if drels is None:
        return None

    # drawing -> images
    candidates: list = []
    for r in drels.findall('pr:Relationship', NS):
        target = r.get('Target') or ''
        if 'image' not in target.lower():
            continue
        if r.get('TargetMode') == 'External':
            # Linked (not embedded) picture: nothing to extract from the archive.
            continue
        img_path = _resolve_target(drawing_path, target)
        try:
            candidates.append((z.getinfo(img_path).file_size, img_path))
        except KeyError:
            # Dangling relationship: the referenced member is not in the archive.
            continue
    if not candidates:
        return None

    # Pick the largest image; ties are broken by the greater archive path.
    largest_size, largest_path = max(candidates)
    if largest_size < MIN_IMAGE_SIZE:
        return None
    return largest_size, largest_path


def main() -> None:
    """Extract one card image per substance sheet and write the image map.

    Reads SOURCE_XLSX as a zip package, skips the sheets listed in
    SKIP_SHEETS or matching SKIP_PATTERN, saves the largest qualifying image
    of every remaining sheet to IMG_DIR, and finally writes
    OUT_DIR/image-map.json keyed by output file name.

    Raises:
        SystemExit: if SOURCE_XLSX does not exist.
    """
    print(f'[읽기] {SOURCE_XLSX}')
    if not SOURCE_XLSX.exists():
        raise SystemExit(f'소스 파일 없음: {SOURCE_XLSX}')

    image_map: dict[str, dict] = {}
    saved = 0
    skipped = 0
    missing = 0

    with zipfile.ZipFile(SOURCE_XLSX) as z:
        sheets = _list_sheets(z)
        print(f'[시트] 총 {len(sheets)}개')

        for name, target in sheets:
            if name in SKIP_SHEETS or SKIP_PATTERN.match(name or ''):
                skipped += 1
                continue
            # A sheet without a name or without a resolvable part cannot be
            # processed; count it as "no image" instead of crashing.
            if not name or not target:
                missing += 1
                continue

            sheet_file = _resolve_target('xl/workbook.xml', target)
            card = _find_card_image(z, sheet_file)
            if card is None:
                missing += 1
                continue
            largest_size, largest_path = card

            # Save under the sanitised sheet name, keeping the image's own
            # extension (falling back to .png when it has none).
            safe = safe_filename(name)
            ext = os.path.splitext(largest_path)[1].lower() or '.png'
            out_name = f'{safe}{ext}'
            (IMG_DIR / out_name).write_bytes(z.read(largest_path))
            image_map[out_name] = {
                'sheetName': name,
                'nameKr': safe,
                'source': largest_path,
                'sizeBytes': largest_size,
            }
            saved += 1
            if saved % 25 == 0:
                print(f'  {saved}개 저장 완료')

    print(f'\n[결과] 저장 {saved} / 스킵(메타) {skipped} / 이미지없음 {missing}')

    map_path = OUT_DIR / 'image-map.json'
    with open(map_path, 'w', encoding='utf-8') as f:
        json.dump(image_map, f, ensure_ascii=False, indent=2)
    print(f'[완료] 매핑 파일: {map_path}')


# Run the extraction only when this file is executed as a script.
if __name__ == '__main__':
    main()