/**
 * base.json + ocr.json → frontend/src/data/hnsSubstanceData.json
 *
 * Match key: the Korean name (nameKr), compared after normalization
 *   (whitespace and punctuation removed, lower-cased).
 * Merge rule: fields coming from the Excel extraction are kept; OCR values
 *   only fill fields that are empty (OCR does not take precedence).
 * In practice most physical-property / hazard fields are empty in base.json,
 * so they end up being populated from the OCR values.
 */
import { readFileSync, writeFileSync, existsSync } from 'node:fs';
import { resolve, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';

const __dirname = dirname(fileURLToPath(import.meta.url));
const OUT_DIR = resolve(__dirname, 'out');
const BASE_PATH = resolve(OUT_DIR, 'base.json');
const OCR_PATH = resolve(OUT_DIR, 'ocr.json');
const TARGET_PATH = resolve(__dirname, '../../../frontend/src/data/hnsSubstanceData.json');

/**
 * Reduces a substance name to the key used for matching.
 *
 * The name is put into Unicode NFC form (so decomposed and precomposed Hangul
 * compare equal), all whitespace and the punctuation `,.·/-_()[]` are removed,
 * and the result is lower-cased.
 *
 * @param s Raw name; `undefined` or an empty string yields `''`.
 * @returns The normalized key, or `''` when there is nothing to match on.
 */
function normalizeName(s: string | undefined): string {
  if (!s) return '';
  return s
    .normalize('NFC')
    .replace(/\s+/g, '')
    .replace(/[,.·/\-_()[\]]/g, '')
    .toLowerCase();
}

/** NFPA rating block carried by every record. */
interface NfpaBlock {
  health: number;
  fire: number;
  reactivity: number;
  special: string;
}

/** MSDS text sections carried by every record. */
interface MsdsBlock {
  hazard: string;
  firstAid: string;
  fireFighting: string;
  spillResponse: string;
  exposure: string;
  regulation: string;
}

/** Shape of one entry of base.json; also the shape of each written record. */
interface BaseRecord {
  id: number;
  abbreviation: string;
  nameKr: string;
  nameEn: string;
  synonymsEn: string;
  synonymsKr: string;
  unNumber: string;
  casNumber: string;
  transportMethod: string;
  sebc: string;
  usage: string;
  state: string;
  color: string;
  odor: string;
  flashPoint: string;
  autoIgnition: string;
  boilingPoint: string;
  density: string;
  solubility: string;
  vaporPressure: string;
  vaporDensity: string;
  explosionRange: string;
  nfpa: NfpaBlock;
  hazardClass: string;
  ergNumber: string;
  idlh: string;
  aegl2: string;
  erpg2: string;
  responseDistanceFire: string;
  responseDistanceSpillDay: string;
  responseDistanceSpillNight: string;
  marineResponse: string;
  ppeClose: string;
  ppeFar: string;
  msds: MsdsBlock;
  ibcHazard: string;
  ibcShipType: string;
  ibcTankType: string;
  ibcDetection: string;
  ibcFireFighting: string;
  ibcMinRequirement: string;
  emsCode: string;
  emsFire: string;
  emsSpill: string;
  emsFirstAid: string;
  cargoCodes: Array<{ code: string; name: string; company: string; source: string }>;
  portFrequency: Array<{ port: string; portCode: string; lastImport: string; frequency: string }>;
}

/** One value of ocr.json. Its fields are untyped and narrowed at each use. */
interface OcrResult {
  [key: string]: unknown;
}

/** Result of matching every OCR entry against the base records. */
interface MatchOutcome {
  merged: BaseRecord[];
  matchedByName: number;
  matchedBySynonym: number;
  unmatched: string[];
}

/**
 * Returns the first argument that is a non-blank string, trimmed.
 *
 * @param values Candidates in priority order; non-string values are skipped.
 * @returns The trimmed winner, or `''` when no candidate qualifies.
 */
function firstString(...values: unknown[]): string {
  for (const candidate of values) {
    if (typeof candidate !== 'string') continue;
    const trimmed = candidate.trim();
    if (trimmed.length > 0) return trimmed;
  }
  return '';
}

/**
 * Extracts an NFPA block from an OCR result.
 *
 * @param ocr OCR result to read `nfpa` from.
 * @returns The block, or `null` when `nfpa` is missing, not an object, or any
 *   of health / fire / reactivity does not convert to a finite number.
 */
function pickNfpa(ocr: OcrResult): NfpaBlock | null {
  const n = ocr.nfpa as Partial<NfpaBlock> | undefined;
  if (!n || typeof n !== 'object') return null;
  // NOTE(review): Number() turns null and '' into 0, so such values are
  // accepted as a rating of 0 rather than rejected — confirm this is intended.
  const health = Number(n.health);
  const fire = Number(n.fire);
  const reactivity = Number(n.reactivity);
  if (![health, fire, reactivity].every((x) => Number.isFinite(x))) return null;
  return {
    health,
    fire,
    reactivity,
    special: typeof n.special === 'string' ? n.special : '',
  };
}

/**
 * Builds the MSDS block, preferring the base text and falling back to OCR.
 *
 * @param ocr OCR result to read `msds` from (a missing value is treated as `{}`).
 * @param base MSDS block of the base record.
 * @returns A new block in which each section is the first non-blank of base, OCR.
 */
function pickMsds(ocr: OcrResult, base: MsdsBlock): MsdsBlock {
  const m = (ocr.msds ?? {}) as Partial<MsdsBlock>;
  return {
    hazard: firstString(base.hazard, m.hazard),
    firstAid: firstString(base.firstAid, m.firstAid),
    fireFighting: firstString(base.fireFighting, m.fireFighting),
    spillResponse: firstString(base.spillResponse, m.spillResponse),
    exposure: firstString(base.exposure, m.exposure),
    regulation: firstString(base.regulation, m.regulation),
  };
}

/**
 * Merges one OCR result into one base record.
 *
 * String fields listed below keep the base value when it is non-blank and are
 * otherwise filled from OCR. Fields not listed are copied from `base` untouched.
 *
 * @param base Record from base.json.
 * @param ocr Matching OCR result; when `undefined`, `base` is returned as is.
 * @returns The merged record (a new object when `ocr` is given).
 */
function merge(base: BaseRecord, ocr: OcrResult | undefined): BaseRecord {
  if (!ocr) return base;
  const nfpaFromOcr = pickNfpa(ocr);
  return {
    ...base,
    transportMethod: firstString(base.transportMethod, ocr.transportMethod),
    sebc: firstString(base.sebc, ocr.sebc),
    state: firstString(base.state, ocr.state),
    color: firstString(base.color, ocr.color),
    odor: firstString(base.odor, ocr.odor),
    flashPoint: firstString(base.flashPoint, ocr.flashPoint),
    autoIgnition: firstString(base.autoIgnition, ocr.autoIgnition),
    boilingPoint: firstString(base.boilingPoint, ocr.boilingPoint),
    density: firstString(base.density, ocr.density),
    solubility: firstString(base.solubility, ocr.solubility),
    vaporPressure: firstString(base.vaporPressure, ocr.vaporPressure),
    vaporDensity: firstString(base.vaporDensity, ocr.vaporDensity),
    explosionRange: firstString(base.explosionRange, ocr.explosionRange),
    // Unlike the string fields, a valid OCR NFPA block replaces the base one.
    // NOTE(review): presumably because base.nfpa is only a placeholder — confirm.
    nfpa: nfpaFromOcr ?? base.nfpa,
    hazardClass: firstString(base.hazardClass, ocr.hazardClass),
    ergNumber: firstString(base.ergNumber, ocr.ergNumber),
    idlh: firstString(base.idlh, ocr.idlh),
    aegl2: firstString(base.aegl2, ocr.aegl2),
    erpg2: firstString(base.erpg2, ocr.erpg2),
    responseDistanceFire: firstString(base.responseDistanceFire, ocr.responseDistanceFire),
    responseDistanceSpillDay: firstString(base.responseDistanceSpillDay, ocr.responseDistanceSpillDay),
    responseDistanceSpillNight: firstString(base.responseDistanceSpillNight, ocr.responseDistanceSpillNight),
    marineResponse: firstString(base.marineResponse, ocr.marineResponse),
    ppeClose: firstString(base.ppeClose, ocr.ppeClose),
    ppeFar: firstString(base.ppeFar, ocr.ppeFar),
    msds: pickMsds(ocr, base.msds),
    emsCode: firstString(base.emsCode, ocr.emsCode),
    emsFire: firstString(base.emsFire, ocr.emsFire),
    emsSpill: firstString(base.emsSpill, ocr.emsSpill),
    emsFirstAid: firstString(base.emsFirstAid, ocr.emsFirstAid),
  };
}

/**
 * Matches OCR entries to base records and merges every matched pair.
 *
 * Matching runs in two passes so that an exact name always wins over a synonym:
 * pass 1 matches on normalized `nameKr`; pass 2 tries the `' / '`-separated
 * `synonymsKr` of the records that are still unmatched. Each OCR entry is
 * consumed by at most one record.
 *
 * @param base Records from base.json, in output order.
 * @param ocr OCR results keyed by substance name.
 * @returns Merged records (same order as `base`), match counters, and the
 *   original keys of OCR entries that matched nothing.
 */
function matchAndMerge(base: readonly BaseRecord[], ocr: Record<string, OcrResult>): MatchOutcome {
  // normalized key → OCR result, and normalized key → original key (for reporting)
  const ocrIndex = new Map<string, OcrResult>();
  const normToOrig = new Map<string, string>();
  for (const [key, value] of Object.entries(ocr)) {
    const normKey = normalizeName(key);
    if (!normKey) continue;
    const previousKey = normToOrig.get(normKey);
    if (previousKey !== undefined) {
      // The later entry wins, as before; surface the collision instead of hiding it.
      console.warn(`[경고] OCR 키 중복(정규화 후 동일): "${previousKey}" → "${key}" 로 덮어씀`);
    }
    ocrIndex.set(normKey, value);
    normToOrig.set(normKey, key);
  }

  const merged: BaseRecord[] = [...base];
  const resolved = new Set<number>();
  let matchedByName = 0;
  let matchedBySynonym = 0;

  // Pass 1: exact match on the normalized nameKr.
  base.forEach((record, i) => {
    const key = normalizeName(record.nameKr);
    const hit = ocrIndex.get(key);
    if (!hit) return;
    ocrIndex.delete(key);
    merged[i] = merge(record, hit);
    resolved.add(i);
    matchedByName++;
  });

  // Pass 2: synonym match for whatever is left, first hit wins.
  base.forEach((record, i) => {
    if (resolved.has(i) || !record.synonymsKr) return;
    for (const syn of record.synonymsKr.split(' / ')) {
      const normSyn = normalizeName(syn);
      if (!normSyn) continue;
      const hit = ocrIndex.get(normSyn);
      if (!hit) continue;
      ocrIndex.delete(normSyn);
      merged[i] = merge(record, hit);
      matchedBySynonym++;
      return;
    }
  });

  // Whatever remains in the index matched no base record; report original keys.
  const unmatched = [...ocrIndex.keys()].map((normKey) => normToOrig.get(normKey) ?? normKey);

  return { merged, matchedByName, matchedBySynonym, unmatched };
}

/**
 * Script entry point: reads base.json (required) and ocr.json (optional),
 * merges them, writes the target JSON, and writes merge-unmatched.json when
 * some OCR entries matched nothing. Exits with code 1 when base.json is missing.
 */
function main(): void {
  if (!existsSync(BASE_PATH)) {
    console.error(`base.json 없음: ${BASE_PATH}`);
    console.error('→ extract-excel.py 를 먼저 실행하세요.');
    process.exit(1);
  }
  const hasOcr = existsSync(OCR_PATH);
  if (!hasOcr) {
    console.warn(`ocr.json 없음: ${OCR_PATH} — 상세 데이터 없이 base 만 사용`);
  }

  const base: BaseRecord[] = JSON.parse(readFileSync(BASE_PATH, 'utf-8'));
  const ocr: Record<string, OcrResult> = hasOcr ? JSON.parse(readFileSync(OCR_PATH, 'utf-8')) : {};
  console.log(`[입력] base ${base.length}종, ocr ${Object.keys(ocr).length}종`);

  const { merged, matchedByName, matchedBySynonym, unmatched } = matchAndMerge(base, ocr);
  const matched = matchedByName + matchedBySynonym;
  console.log(`[병합] base ↔ ocr 매칭 ${matched}종 (nameKr: ${matched - matchedBySynonym}, 동의어: ${matchedBySynonym})`);

  if (unmatched.length > 0) {
    const sortedKeys = [...unmatched].sort();
    const unmatchedPath = resolve(OUT_DIR, 'merge-unmatched.json');
    writeFileSync(unmatchedPath, JSON.stringify({ count: sortedKeys.length, keys: sortedKeys }, null, 2), 'utf-8');
    console.warn(`[경고] OCR 매칭 실패 ${unmatched.length}개 → ${unmatchedPath}`);
    sortedKeys.slice(0, 20).forEach((k) => console.warn(` - ${k}`));
    if (sortedKeys.length > 20) console.warn(` ... +${sortedKeys.length - 20}`);
  }

  // Serialize once so the reported size is the UTF-8 size of what was written.
  const json = JSON.stringify(merged, null, 2);
  writeFileSync(TARGET_PATH, json, 'utf-8');
  const sizeKb = (Buffer.byteLength(json, 'utf-8') / 1024).toFixed(0);
  console.log(`[완료] ${TARGET_PATH} (${sizeKb} KB, ${merged.length}종)`);
  console.log(` 상세 정보 보유: ${merged.filter((r) => r.flashPoint).length}종`);
}

main();