"""SCAT PDF 파싱 CLI 도구. 사용법: python run.py # 단일 PDF 파싱 python run.py # 배치 파싱 python run.py --load-json output/ --geocode # JSON에 좌표 추가 python run.py --load-json output/ --save # JSON → DB 저장 python run.py --load-json output/ --save --dry-run # DB 저장 미리보기 """ from __future__ import annotations import argparse import io import json import re import sys from collections import defaultdict from pathlib import Path # Windows cp949 대응 if sys.platform == 'win32': sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8') import fitz from pdf_parser import parse_pdf from pdf_parser_b import parse_pdf_b from models import CoastalSection, SensitiveItem OUTPUT_DIR = Path(__file__).parent / 'output' # --------------------------------------------------------------------------- # PDF 형식 감지 # --------------------------------------------------------------------------- def detect_pdf_type(pdf_path: Path) -> str: """PDF 형식 감지. 'A'(해안사전평가정보집) 또는 'B'(방제정보집) 반환.""" doc = fitz.open(str(pdf_path)) for i in range(min(30, doc.page_count)): text = doc[i].get_text('text') if '식별자' in text and '코드명' in text: doc.close() return 'B' doc.close() return 'A' # --------------------------------------------------------------------------- # PDF 파싱 # --------------------------------------------------------------------------- def process_pdf(pdf_path: Path) -> dict: """단일 PDF를 파싱하고 JSON 파일로 저장한다.""" pdf_type = detect_pdf_type(pdf_path) if pdf_type == 'B': result = parse_pdf_b(str(pdf_path)) else: result = parse_pdf(str(pdf_path)) data = result.model_dump() for s in data['sections']: s.pop('photos', None) OUTPUT_DIR.mkdir(parents=True, exist_ok=True) out_path = OUTPUT_DIR / f'{pdf_path.stem}.json' with open(out_path, 'w', encoding='utf-8') as f: json.dump(data, f, ensure_ascii=False, indent=2) return { 'file': pdf_path.name, 'output': str(out_path), 'zone_name': result.zone_name, 'jurisdiction': result.jurisdiction, 'total_sections': result.total_sections, 'skipped_pages': result.skipped_pages, } def run_parse(target: Path): """PDF 파싱 실행.""" if target.is_file() and target.suffix.lower() == '.pdf': pdf_files = [target] elif target.is_dir(): pdf_files = sorted(target.glob('*.pdf')) if not pdf_files: print(f'PDF 파일을 찾을 수 없습니다: {target}') sys.exit(1) print(f'{len(pdf_files)}개 PDF 발견\n') else: print(f'유효하지 않은 경로: {target}') sys.exit(1) results = [] for i, pdf in enumerate(pdf_files, 1): pdf_type = detect_pdf_type(pdf) print(f'[{i}/{len(pdf_files)}] {pdf.name} (Type {pdf_type}) 파싱 중...') try: info = process_pdf(pdf) results.append(info) print(f' -> {info["total_sections"]}개 구간 | {info["zone_name"]} | {info["jurisdiction"]}') print(f' -> 저장: {info["output"]}') except Exception as e: print(f' -> 오류: {e}') results.append({'file': pdf.name, 'error': str(e)}) if len(results) > 1: print(f'\n=== 요약 ===') success = [r for r in results if 'error' not in r] failed = [r for r in results if 'error' in r] total_sections = sum(r['total_sections'] for r in success) print(f'성공: {len(success)}개 / 실패: {len(failed)}개 / 총 구간: {total_sections}개') if failed: print(f'실패 파일: {", ".join(r["file"] for r in failed)}') # --------------------------------------------------------------------------- # JSON → DB 저장 # --------------------------------------------------------------------------- def _extract_zone_cd(sect_cd: str) -> str: """sect_cd에서 zone_cd 추출 (영문 접두사). Type A: SSDD-1 → SSDD (하이픈 앞 영문) Type B: BSBB-1-M-E-S-N → BSBB (첫 하이픈 앞 영문) """ m = re.match(r'^([A-Z]{2,})', sect_cd) return m.group(1) if m else sect_cd def _extract_jrsd_short(jurisdiction: str) -> str: """관할 기관명에서 짧은 이름 추출. 예: '보령 해양경비안전서' → '보령'""" if not jurisdiction: return '' return jurisdiction.split()[0] if ' ' in jurisdiction else jurisdiction def _dict_to_section(d: dict) -> CoastalSection: """JSON dict → CoastalSection 모델 변환.""" sensitive = [SensitiveItem(**item) for item in (d.get('sensitive_info') or [])] return CoastalSection( section_number=d.get('section_number', 0), sect_nm=d.get('sect_nm', ''), sect_cd=d.get('sect_cd', ''), esi_cd=d.get('esi_cd'), esi_num=d.get('esi_num'), shore_tp=d.get('shore_tp'), cst_tp_cd=d.get('cst_tp_cd'), len_m=d.get('len_m'), width_m=d.get('width_m'), lat=d.get('lat'), lng=d.get('lng'), access_dc=d.get('access_dc'), access_pt=d.get('access_pt'), sensitive_info=sensitive, cleanup_methods=d.get('cleanup_methods', []), end_criteria=d.get('end_criteria', []), notes=d.get('notes', []), ) def load_json_files(json_dir: Path) -> list[dict]: """JSON 디렉토리에서 모든 파싱 결과를 로드한다.""" all_data = [] for f in sorted(json_dir.glob('*.json')): with open(f, encoding='utf-8') as fp: data = json.load(fp) if data.get('total_sections', 0) > 0: all_data.append(data) return all_data def group_by_zone(all_data: list[dict]) -> dict: """파싱 결과를 zone_cd로 그룹핑한다. Returns: {zone_cd: { 'zone_nm': str, 'jrsd_nm': str, 'sections': [dict, ...] }} """ zones = defaultdict(lambda: {'zone_nm': '', 'jrsd_nm': '', 'sections': []}) for data in all_data: zone_name = data.get('zone_name', '') jrsd_nm = _extract_jrsd_short(data.get('jurisdiction', '')) for sect in data['sections']: zone_cd = _extract_zone_cd(sect['sect_cd']) zone = zones[zone_cd] if not zone['zone_nm']: zone['zone_nm'] = zone_name if not zone['jrsd_nm']: zone['jrsd_nm'] = jrsd_nm zone['sections'].append(sect) return dict(zones) def run_save(json_dir: Path, dry_run: bool = False): """JSON 파싱 결과를 DB에 저장한다.""" all_data = load_json_files(json_dir) if not all_data: print(f'유효한 JSON 파일을 찾을 수 없습니다: {json_dir}') sys.exit(1) zones = group_by_zone(all_data) total_sections = sum(len(z['sections']) for z in zones.values()) print(f'=== DB 저장 {"미리보기" if dry_run else "시작"} ===') print(f'총 {len(zones)}개 zone, {total_sections}개 구간\n') for zone_cd, zone_info in sorted(zones.items()): sect_count = len(zone_info['sections']) print(f' {zone_cd:8s} | {zone_info["zone_nm"]:20s} | {zone_info["jrsd_nm"]:8s} | {sect_count}개 구간') if dry_run: print(f'\n(dry-run 모드 — DB에 저장하지 않음)') return # 실제 DB 저장 from db import ensure_zone, upsert_section, update_zone_sect_count, update_zone_center, close_pool saved_zones = 0 saved_sections = 0 try: for zone_cd, zone_info in sorted(zones.items()): zone_sn = ensure_zone(zone_cd, zone_info['zone_nm'], zone_info['jrsd_nm']) saved_zones += 1 for sect_dict in zone_info['sections']: section = _dict_to_section(sect_dict) upsert_section(zone_sn, section) saved_sections += 1 update_zone_sect_count(zone_sn) update_zone_center(zone_sn) print(f'\n=== 완료 ===') print(f'{saved_zones}개 zone, {saved_sections}개 구간 저장 완료') except Exception as e: print(f'\n오류 발생: {e}') print(f'저장 진행: {saved_zones}개 zone, {saved_sections}개 구간까지 완료') raise finally: close_pool() # --------------------------------------------------------------------------- # Geocoding # --------------------------------------------------------------------------- def run_geocode(json_dir: Path): """JSON 파싱 결과에 좌표를 추가한다.""" from geocoder import geocode_sections, load_cache, save_cache load_cache() json_files = sorted(json_dir.glob('*.json')) json_files = [f for f in json_files if not f.name.startswith('.')] if not json_files: print(f'JSON 파일을 찾을 수 없습니다: {json_dir}') sys.exit(1) print(f'=== Geocoding 시작 ({len(json_files)}개 JSON) ===\n') total_success = 0 total_fail = 0 for i, f in enumerate(json_files, 1): with open(f, encoding='utf-8') as fp: data = json.load(fp) sections = data.get('sections', []) if not sections: continue zone_name = data.get('zone_name', '') print(f'[{i}/{len(json_files)}] {f.name} ({len(sections)}개 구간)...') success, fail = geocode_sections(sections, zone_name) total_success += success total_fail += fail # 좌표가 있는 구간 수 with_coords = sum(1 for s in sections if s.get('lat')) print(f' -> 좌표: {with_coords}/{len(sections)}') # JSON 업데이트 저장 with open(f, 'w', encoding='utf-8') as fp: json.dump(data, fp, ensure_ascii=False, indent=2) save_cache() print(f'\n=== Geocoding 완료 ===') print(f'성공: {total_success} / 실패: {total_fail}') # --------------------------------------------------------------------------- # 이미지 추출 # --------------------------------------------------------------------------- def run_extract_images(target: Path): """PDF에서 해안사진을 추출하여 scat-photos/에 저장.""" from image_extractor import extract_images_from_pdf if target.is_file() and target.suffix.lower() == '.pdf': pdf_files = [target] elif target.is_dir(): pdf_files = sorted(target.glob('*.pdf')) if not pdf_files: print(f'PDF 파일을 찾을 수 없습니다: {target}') sys.exit(1) print(f'{len(pdf_files)}개 PDF 발견\n') else: print(f'유효하지 않은 경로: {target}') sys.exit(1) total = 0 for i, pdf in enumerate(pdf_files, 1): pdf_type = detect_pdf_type(pdf) print(f'[{i}/{len(pdf_files)}] {pdf.name} (Type {pdf_type}) 이미지 추출 중...') try: count = extract_images_from_pdf(pdf, pdf_type=pdf_type) total += count print(f' -> {count}개 이미지 저장') except Exception as e: print(f' -> 오류: {e}') print(f'\n=== 이미지 추출 완료: 총 {total}개 ===') # --------------------------------------------------------------------------- # CLI # --------------------------------------------------------------------------- def main(): parser = argparse.ArgumentParser(description='SCAT PDF 파싱 CLI 도구') parser.add_argument('target', nargs='?', help='PDF 파일 또는 디렉토리 경로') parser.add_argument('--save', action='store_true', help='파싱 결과를 DB에 저장') parser.add_argument('--load-json', type=Path, help='이미 파싱된 JSON 디렉토리에서 로드') parser.add_argument('--geocode', action='store_true', help='JSON에 Kakao Geocoding으로 좌표 추가') parser.add_argument('--extract-images', action='store_true', help='PDF에서 해안사진 추출 → scat-photos/') parser.add_argument('--dry-run', action='store_true', help='DB 저장 미리보기 (실제 저장 안 함)') args = parser.parse_args() # JSON 로드 모드 if args.load_json: if args.geocode: run_geocode(args.load_json) if args.save: run_save(args.load_json, dry_run=args.dry_run) if not args.geocode and not args.save: print('--load-json은 --geocode 또는 --save와 함께 사용해야 합니다.') sys.exit(1) return # PDF 파싱 모드 if not args.target: parser.print_help() sys.exit(1) target = Path(args.target) # 이미지 추출 모드 if args.extract_images: run_extract_images(target) return run_parse(target) # 파싱 후 바로 DB 저장 if args.save: print('\n') run_save(OUTPUT_DIR, dry_run=args.dry_run) if __name__ == '__main__': main()