/**
 * base.json + pdf-data.json + ocr.json → frontend/src/data/hnsSubstanceData.json
 *
 * Default field priority:
 *   pdf-data (text extracted from PDF, most accurate)
 *     > base.json
 *     > ocr.json (image OCR, least accurate)
 * A handful of fields deliberately prefer base.json first, and NFPA uses
 * pdf > ocr > base; see `merge` for the per-field order.
 *
 * PDF match key order:
 *   1. CAS number (the most reliable identifier)
 *   2. Normalized Korean name (nameKr)
 *   3. Normalized Korean synonyms (synonymsKr)
 */
import { readFileSync, writeFileSync, existsSync, statSync } from 'node:fs';
import { resolve, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';

const __dirname = dirname(fileURLToPath(import.meta.url));
const OUT_DIR = resolve(__dirname, 'out');
const BASE_PATH = resolve(OUT_DIR, 'base.json');
const PDF_PATH = resolve(OUT_DIR, 'pdf-data.json');
const OCR_PATH = resolve(OUT_DIR, 'ocr.json');
const TARGET_PATH = resolve(__dirname, '../../../frontend/src/data/hnsSubstanceData.json');

/**
 * Normalizes a substance name for lookup: removes all whitespace and a fixed
 * set of punctuation characters, then lower-cases the result.
 *
 * @param s Raw name; `undefined`/empty yields `''`.
 * @returns The normalized key, or `''` when there is nothing to normalize.
 */
function normalizeName(s: string | undefined): string {
  if (!s) return '';
  // NOTE(review): `()` appears twice in the character class; the second pair
  // may originally have been full-width parentheses — confirm against the
  // upstream file.
  return s
    .replace(/\s+/g, '')
    .replace(/[,.·/\-_()[\]()]/g, '')
    .toLowerCase();
}

/**
 * Normalizes a CAS number for lookup: keeps only digits and hyphens, then
 * strips leading zeros.
 *
 * @param s Raw CAS number; `undefined`/empty yields `''`.
 * @returns The normalized CAS key, or `''` when nothing usable remains.
 */
function normalizeCas(s: string | undefined): string {
  if (!s) return '';
  return s.replace(/[^0-9\-]/g, '').replace(/^0+/, '');
}

interface NfpaBlock {
  health: number;
  fire: number;
  reactivity: number;
  special: string;
}

interface MsdsBlock {
  hazard: string;
  firstAid: string;
  fireFighting: string;
  spillResponse: string;
  exposure: string;
  regulation: string;
}

interface BaseRecord {
  id: number;
  abbreviation: string;
  nameKr: string;
  nameEn: string;
  synonymsEn: string;
  synonymsKr: string;
  unNumber: string;
  casNumber: string;
  transportMethod: string;
  sebc: string;
  usage: string;
  state: string;
  color: string;
  odor: string;
  flashPoint: string;
  autoIgnition: string;
  boilingPoint: string;
  density: string;
  solubility: string;
  vaporPressure: string;
  vaporDensity: string;
  explosionRange: string;
  nfpa: NfpaBlock;
  hazardClass: string;
  ergNumber: string;
  idlh: string;
  aegl2: string;
  erpg2: string;
  responseDistanceFire: string;
  responseDistanceSpillDay: string;
  responseDistanceSpillNight: string;
  marineResponse: string;
  ppeClose: string;
  ppeFar: string;
  msds: MsdsBlock;
  ibcHazard: string;
  ibcShipType: string;
  ibcTankType: string;
  ibcDetection: string;
  ibcFireFighting: string;
  ibcMinRequirement: string;
  emsCode: string;
  emsFire: string;
  emsSpill: string;
  emsFirstAid: string;
  cargoCodes: Array<{ code: string; name: string; company: string; source: string }>;
  portFrequency: Array<{ port: string; portCode: string; lastImport: string; frequency: string }>;
}

interface PdfResult {
  [key: string]: unknown;
  casNumber?: string;
  nameKr?: string;
  nfpa?: Partial<NfpaBlock>;
  msds?: Partial<MsdsBlock>;
}

interface OcrResult {
  [key: string]: unknown;
}

/** A PDF record together with the key it was stored under in pdf-data.json. */
interface PdfEntry {
  key: string;
  value: PdfResult;
}

/**
 * Returns the first argument that is a non-blank string, trimmed.
 *
 * @param values Candidates in priority order; non-strings are skipped.
 * @returns The first usable string, or `''` when none qualifies.
 */
function firstString(...values: Array<unknown>): string {
  for (const v of values) {
    if (typeof v === 'string' && v.trim().length > 0) return v.trim();
  }
  return '';
}

/**
 * Converts a raw rating to a finite number.
 *
 * Unlike a bare `Number(x)`, `null`, `undefined`, blank strings and
 * non-number/string values are treated as missing instead of becoming 0.
 */
function toFiniteNumber(value: unknown): number | null {
  if (typeof value === 'number') return Number.isFinite(value) ? value : null;
  if (typeof value === 'string' && value.trim() !== '') {
    const parsed = Number(value);
    return Number.isFinite(parsed) ? parsed : null;
  }
  return null;
}

/**
 * Extracts a complete NFPA block from a PDF/OCR record.
 *
 * @param source Record whose `nfpa` property is inspected.
 * @returns The block when health, fire and reactivity are all present and
 *          finite; otherwise `null` so the caller can fall back.
 */
function pickNfpa(source: PdfResult | OcrResult): NfpaBlock | null {
  const raw: unknown = source.nfpa;
  if (!raw || typeof raw !== 'object') return null;
  const block = raw as Record<string, unknown>;

  const health = toFiniteNumber(block.health);
  const fire = toFiniteNumber(block.fire);
  const reactivity = toFiniteNumber(block.reactivity);
  if (health === null || fire === null || reactivity === null) return null;

  return {
    health,
    fire,
    reactivity,
    special: typeof block.special === 'string' ? block.special : '',
  };
}

/**
 * Builds the MSDS block. Every field uses base > pdf > ocr priority.
 *
 * @param pdf  Matched PDF record, if any.
 * @param ocr  Matched OCR record, if any.
 * @param base MSDS block from base.json.
 */
function pickMsds(
  pdf: PdfResult | undefined,
  ocr: OcrResult | undefined,
  base: MsdsBlock,
): MsdsBlock {
  const p = (pdf?.msds ?? {}) as Partial<MsdsBlock>;
  const o = (ocr?.msds ?? {}) as Partial<MsdsBlock>;
  return {
    hazard: firstString(base.hazard, p.hazard, o.hazard),
    firstAid: firstString(base.firstAid, p.firstAid, o.firstAid),
    fireFighting: firstString(base.fireFighting, p.fireFighting, o.fireFighting),
    spillResponse: firstString(base.spillResponse, p.spillResponse, o.spillResponse),
    exposure: firstString(base.exposure, p.exposure, o.exposure),
    regulation: firstString(base.regulation, p.regulation, o.regulation),
  };
}

/**
 * Merges one base record with its matched PDF and OCR records.
 *
 * Fields not listed in the returned literal are copied from `base` unchanged.
 *
 * @param base Record from base.json.
 * @param pdf  Matched PDF record, if any.
 * @param ocr  Matched OCR record, if any.
 * @returns A new record; `base` is not mutated.
 */
function merge(
  base: BaseRecord,
  pdf: PdfResult | undefined,
  ocr: OcrResult | undefined,
): BaseRecord {
  const nfpaFromPdf = pdf ? pickNfpa(pdf) : null;
  const nfpaFromOcr = ocr ? pickNfpa(ocr) : null;
  // NFPA: prefer pdf, then ocr, then base.
  const nfpa = nfpaFromPdf ?? nfpaFromOcr ?? base.nfpa;

  // pdf > base > ocr
  const pdfFirst = (field: keyof BaseRecord & string): string =>
    firstString(pdf?.[field], base[field], ocr?.[field]);
  // base > pdf > ocr
  const baseFirst = (field: keyof BaseRecord & string): string =>
    firstString(base[field], pdf?.[field], ocr?.[field]);

  return {
    ...base,
    unNumber: pdfFirst('unNumber'),
    casNumber: pdfFirst('casNumber'),
    synonymsKr: pdfFirst('synonymsKr'),
    transportMethod: baseFirst('transportMethod'),
    sebc: baseFirst('sebc'),
    usage: pdfFirst('usage'),
    state: pdfFirst('state'),
    color: pdfFirst('color'),
    odor: pdfFirst('odor'),
    flashPoint: pdfFirst('flashPoint'),
    autoIgnition: pdfFirst('autoIgnition'),
    boilingPoint: pdfFirst('boilingPoint'),
    density: pdfFirst('density'),
    solubility: pdfFirst('solubility'),
    vaporPressure: pdfFirst('vaporPressure'),
    vaporDensity: pdfFirst('vaporDensity'),
    explosionRange: pdfFirst('explosionRange'),
    nfpa,
    hazardClass: pdfFirst('hazardClass'),
    ergNumber: baseFirst('ergNumber'),
    idlh: pdfFirst('idlh'),
    aegl2: baseFirst('aegl2'),
    erpg2: baseFirst('erpg2'),
    responseDistanceFire: pdfFirst('responseDistanceFire'),
    responseDistanceSpillDay: pdfFirst('responseDistanceSpillDay'),
    responseDistanceSpillNight: pdfFirst('responseDistanceSpillNight'),
    marineResponse: pdfFirst('marineResponse'),
    ppeClose: baseFirst('ppeClose'),
    ppeFar: baseFirst('ppeFar'),
    msds: pickMsds(pdf, ocr, base.msds),
    emsCode: baseFirst('emsCode'),
    emsFire: baseFirst('emsFire'),
    emsSpill: baseFirst('emsSpill'),
    emsFirstAid: baseFirst('emsFirstAid'),
  };
}

/**
 * Splits a ' / '-separated synonym list and normalizes each entry, dropping
 * entries that normalize to the empty string.
 */
function normalizedSynonyms(synonymsKr: string | undefined): string[] {
  if (!synonymsKr) return [];
  return synonymsKr
    .split(' / ')
    .map((syn) => normalizeName(syn))
    .filter((syn) => syn.length > 0);
}

/**
 * Script entry point: loads the three inputs, matches PDF/OCR records to each
 * base record, writes the merged JSON to TARGET_PATH and prints statistics.
 * Exits with status 1 when base.json is missing. PDF keys that matched no base
 * record are written to `out/pdf-unmatched.json`.
 */
function main() {
  if (!existsSync(BASE_PATH)) {
    console.error(`base.json 없음: ${BASE_PATH}`);
    console.error('→ extract-excel.py 를 먼저 실행하세요.');
    process.exit(1);
  }

  const base: BaseRecord[] = JSON.parse(readFileSync(BASE_PATH, 'utf-8'));

  // Optional inputs: a missing file is treated as an empty data set.
  const pdfRaw: Record<string, PdfResult> = existsSync(PDF_PATH)
    ? JSON.parse(readFileSync(PDF_PATH, 'utf-8'))
    : {};
  const ocr: Record<string, OcrResult> = existsSync(OCR_PATH)
    ? JSON.parse(readFileSync(OCR_PATH, 'utf-8'))
    : {};

  console.log(
    `[입력] base ${base.length}종, pdf ${Object.keys(pdfRaw).length}종, ocr ${Object.keys(ocr).length}종`,
  );

  // ── PDF indexes ────────────────────────────────────────────────────
  // Each entry remembers its original key so the unmatched report can be
  // maintained by key rather than by the record's (optional) nameKr.
  const pdfByName = new Map<string, PdfEntry>();
  const pdfByCas = new Map<string, PdfEntry>();
  for (const [key, value] of Object.entries(pdfRaw)) {
    const entry: PdfEntry = { key, value };
    const normKey = normalizeName(key);
    if (normKey) pdfByName.set(normKey, entry);
    const cas = normalizeCas(value.casNumber);
    // First record wins when several share one CAS number.
    if (cas && !pdfByCas.has(cas)) pdfByCas.set(cas, entry);
  }

  // ── OCR index ──────────────────────────────────────────────────────
  const ocrByName = new Map<string, OcrResult>();
  for (const [key, value] of Object.entries(ocr)) {
    const normKey = normalizeName(key);
    if (normKey) ocrByName.set(normKey, value);
  }

  // ── Merge ──────────────────────────────────────────────────────────
  let pdfMatchedByName = 0;
  let pdfMatchedByCas = 0;
  let pdfMatchedBySynonym = 0;
  let ocrMatched = 0;
  const pdfUnmatched = new Set<string>(Object.keys(pdfRaw));

  const merged = base.map((record) => {
    const normKr = normalizeName(record.nameKr);
    const synonyms = normalizedSynonyms(record.synonymsKr);

    // PDF matching: CAS number, then Korean name, then synonyms.
    let pdfEntry: PdfEntry | undefined;
    const baseCas = normalizeCas(record.casNumber);
    if (baseCas) pdfEntry = pdfByCas.get(baseCas);
    if (pdfEntry) {
      pdfMatchedByCas++;
    } else {
      pdfEntry = pdfByName.get(normKr);
      if (pdfEntry) {
        pdfMatchedByName++;
      } else {
        for (const syn of synonyms) {
          pdfEntry = pdfByName.get(syn);
          if (pdfEntry) {
            pdfMatchedBySynonym++;
            break;
          }
        }
      }
    }
    if (pdfEntry) pdfUnmatched.delete(pdfEntry.key);

    // OCR matching always runs (Korean name, then synonyms); `merge` gives
    // OCR values the lowest priority for string fields.
    let ocrResult: OcrResult | undefined = ocrByName.get(normKr);
    if (!ocrResult) {
      for (const syn of synonyms) {
        ocrResult = ocrByName.get(syn);
        if (ocrResult) break;
      }
    }
    if (ocrResult) ocrMatched++;

    return merge(record, pdfEntry?.value, ocrResult);
  });

  // ── Statistics ─────────────────────────────────────────────────────
  const pdfTotal = pdfMatchedByCas + pdfMatchedByName + pdfMatchedBySynonym;
  console.log(
    `[PDF 매칭] 총 ${pdfTotal}종 (CAS: ${pdfMatchedByCas}, 국문명: ${pdfMatchedByName}, 동의어: ${pdfMatchedBySynonym})`,
  );
  console.log(`[OCR 매칭] ${ocrMatched}종`);

  if (pdfUnmatched.size > 0) {
    const unmatchedList = Array.from(pdfUnmatched).sort();
    const unmatchedPath = resolve(OUT_DIR, 'pdf-unmatched.json');
    writeFileSync(
      unmatchedPath,
      JSON.stringify({ count: unmatchedList.length, keys: unmatchedList }, null, 2),
      'utf-8',
    );
    console.warn(
      `[경고] PDF 매칭 실패 ${unmatchedList.length}개 → ${unmatchedPath}`,
    );
    unmatchedList.slice(0, 10).forEach((k) => console.warn(` - ${k}`));
    if (unmatchedList.length > 10) console.warn(` ... +${unmatchedList.length - 10}`);
  }

  writeFileSync(TARGET_PATH, JSON.stringify(merged, null, 2), 'utf-8');
  // Report the real on-disk size of the file that was just written.
  const sizeKb = (statSync(TARGET_PATH).size / 1024).toFixed(0);
  console.log(`[완료] ${TARGET_PATH} (${sizeKb} KB, ${merged.length}종)`);
  console.log(` 상세 정보 보유: ${merged.filter((r) => r.flashPoint).length}종`);
  console.log(` NFPA 있음: ${merged.filter((r) => r.nfpa.health || r.nfpa.fire || r.nfpa.reactivity).length}종`);
}

main();