wing-ops/backend/scripts/hns-import/extract-images.py

171 lines
5.6 KiB
Python

"""물질별 시트에서 메인 카드 이미지(100KB+) 추출.
엑셀 워크시트 → drawing → image 관계 체인을 추적해
각 물질 시트의 핵심 이미지만 out/images/{nameKr}.png 로 저장.
동시에 out/image-map.json 생성 (파일명 ↔ 시트명/국문명 매핑).
"""
from __future__ import annotations
import io
import json
import os
import re
import sys
import zipfile
from pathlib import Path
from xml.etree import ElementTree as ET
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
SCRIPT_DIR = Path(__file__).parent.resolve()
OUT_DIR = SCRIPT_DIR / 'out'
IMG_DIR = OUT_DIR / 'images'
IMG_DIR.mkdir(parents=True, exist_ok=True)
SOURCE_XLSX = Path(os.environ.get(
'HNS_SOURCE_XLSX',
r'C:\Projects\MeterialDB\유해물질 화물적부도 검색툴.xlsm',
))
NS = {
'm': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main',
'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
'pr': 'http://schemas.openxmlformats.org/package/2006/relationships',
}
# 메타 시트(데이터 시트)는 스킵
SKIP_SHEETS = {
'화물적부도 화물코드',
'항구별 코드',
'동의어',
'IBC CODE',
'경계선',
}
# 지침서 번호 시트(115~171) 패턴: 순수 숫자
SKIP_PATTERN = re.compile(r'^\d{3}$')
# 최소 이미지 크기 (주요 카드만 대상, 작은 아이콘 제외)
MIN_IMAGE_SIZE = 50_000 # 50 KB
def safe_filename(name: str) -> str:
name = name.strip().rstrip(',').strip()
name = re.sub(r'[<>:"/\\|?*]', '_', name)
return name
def norm_path(p: str) -> str:
return os.path.normpath(p).replace(os.sep, '/')
def main() -> None:
print(f'[읽기] {SOURCE_XLSX}')
if not SOURCE_XLSX.exists():
raise SystemExit(f'소스 파일 없음: {SOURCE_XLSX}')
image_map: dict[str, dict] = {}
saved = 0
skipped = 0
missing = 0
with zipfile.ZipFile(SOURCE_XLSX) as z:
# 1) workbook.xml → sheet 목록
with z.open('xl/workbook.xml') as f:
wb_root = ET.parse(f).getroot()
sheets = []
for s in wb_root.findall('m:sheets/m:sheet', NS):
sheets.append({
'name': s.get('name'),
'rid': s.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id'),
})
with z.open('xl/_rels/workbook.xml.rels') as f:
rels_root = ET.parse(f).getroot()
rid_target = {r.get('Id'): r.get('Target') for r in rels_root.findall('pr:Relationship', NS)}
for s in sheets:
s['target'] = rid_target.get(s['rid'])
print(f'[시트] 총 {len(sheets)}')
for s in sheets:
name = s['name']
if name in SKIP_SHEETS or SKIP_PATTERN.match(name or ''):
skipped += 1
continue
sheet_file = 'xl/' + s['target']
rels_file = os.path.dirname(sheet_file) + '/_rels/' + os.path.basename(sheet_file) + '.rels'
try:
with z.open(rels_file) as f:
srels = ET.parse(f).getroot()
except KeyError:
missing += 1
continue
# 시트 → drawing
drawing_rel = None
for r in srels.findall('pr:Relationship', NS):
t = r.get('Target') or ''
if 'drawing' in (r.get('Type') or '').lower() and 'drawings/' in t:
drawing_rel = t
break
if not drawing_rel:
missing += 1
continue
drawing_path = norm_path(os.path.join(os.path.dirname(sheet_file), drawing_rel))
drawing_rels_path = os.path.dirname(drawing_path) + '/_rels/' + os.path.basename(drawing_path) + '.rels'
try:
with z.open(drawing_rels_path) as f:
drels = ET.parse(f).getroot()
except KeyError:
missing += 1
continue
# drawing → images
image_paths: list[str] = []
for r in drels.findall('pr:Relationship', NS):
t = r.get('Target') or ''
if 'image' in t.lower():
img_path = norm_path(os.path.join(os.path.dirname(drawing_path), t))
image_paths.append(img_path)
if not image_paths:
missing += 1
continue
# 가장 큰 이미지 선택 (실제 카드 이미지는 100KB+, 아이콘은 수 KB)
sized = [(z.getinfo(p).file_size, p) for p in image_paths]
sized.sort(reverse=True)
largest_size, largest_path = sized[0]
if largest_size < MIN_IMAGE_SIZE:
missing += 1
continue
# 저장
safe = safe_filename(name)
ext = os.path.splitext(largest_path)[1].lower() or '.png'
out_name = f'{safe}{ext}'
out_path = IMG_DIR / out_name
with z.open(largest_path) as fin, open(out_path, 'wb') as fout:
fout.write(fin.read())
image_map[out_name] = {
'sheetName': name,
'nameKr': safe,
'source': largest_path,
'sizeBytes': largest_size,
}
saved += 1
if saved % 25 == 0:
print(f' {saved}개 저장 완료')
print(f'\n[결과] 저장 {saved} / 스킵(메타) {skipped} / 이미지없음 {missing}')
map_path = OUT_DIR / 'image-map.json'
with open(map_path, 'w', encoding='utf-8') as f:
json.dump(image_map, f, ensure_ascii=False, indent=2)
print(f'[완료] 매핑 파일: {map_path}')
if __name__ == '__main__':
main()