wing-ops/backend/scripts/hns-import/merge-data.ts

363 lines
13 KiB
TypeScript
Raw Blame 히스토리

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

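/**
 * base.json + pdf-data.json + ocr.json → frontend/src/data/hnsSubstanceData.json
 *
 * Default priority: pdf-data (PDF text extraction, highest accuracy) > base.json > ocr.json (image OCR, low accuracy).
 * Exceptions (see merge()): some fields such as transportMethod, sebc, ergNumber, aegl2, erpg2, ppe*, ems* and msds
 * prefer base.json, and NFPA uses pdf > ocr > base.
 * Matching key order:
 * 1. CAS number (the most reliable identifier; used for PDF data only)
 * 2. Normalized Korean name (nameKr)
 * 3. Normalized Korean synonyms (synonymsKr)
 */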
import { readFileSync, writeFileSync, existsSync } from 'node:fs';
import { resolve, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
// Directory of this script, derived from import.meta.url.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Directory holding the intermediate JSON inputs; the PDF-unmatched report is also written here.
const OUT_DIR = resolve(__dirname, 'out');
// Input files read by main(). Only base.json is required; the other two are optional.
const BASE_PATH = resolve(OUT_DIR, 'base.json');
const PDF_PATH = resolve(OUT_DIR, 'pdf-data.json');
const OCR_PATH = resolve(OUT_DIR, 'ocr.json');
// Destination of the merged dataset, resolved relative to this script's directory.
const TARGET_PATH = resolve(__dirname, '../../../frontend/src/data/hnsSubstanceData.json');
/**
 * Builds a comparison key from a substance name.
 *
 * All whitespace and the punctuation characters `, . · / - _ ( ) [ ]` are
 * removed and the remainder is lowercased, so names that differ only in
 * spacing or punctuation compare equal.
 *
 * @param s Raw name; may be undefined or empty.
 * @returns The normalized key, or '' when the input is falsy.
 */
function normalizeName(s: string | undefined): string {
  if (!s) {
    return '';
  }
  const withoutWhitespace = s.replace(/\s+/g, '');
  const withoutPunctuation = withoutWhitespace.replace(/[,.·/\-_()[\]]/g, '');
  return withoutPunctuation.toLowerCase();
}
/**
 * Builds a comparison key from a CAS registry number.
 *
 * Unicode dash variants (hyphen, figure/en/em dash, minus sign, small and
 * full-width hyphen-minus) are first mapped to ASCII '-', because text
 * extracted from PDFs frequently uses them; without this step they would be
 * stripped and "7732–18–5" would never match "7732-18-5".
 * Everything except digits and '-' is then removed, and leading zeros are dropped.
 *
 * @param s Raw CAS value; may be undefined or empty.
 * @returns The normalized key, or '' when the input is falsy.
 */
function normalizeCas(s: string | undefined): string {
  if (!s) return '';
  return s
    // Treat typographic dashes as the ASCII hyphen before filtering.
    .replace(/[\u2010-\u2015\u2212\uFE58\uFE63\uFF0D]/g, '-')
    // Keep digits and hyphens only (this also removes all whitespace).
    .replace(/[^0-9-]/g, '')
    // Strip leading zeros.
    .replace(/^0+/, '');
}
/** NFPA rating values stored on a substance record. */
interface NfpaBlock {
health: number;
fire: number;
reactivity: number;
// Free-form text; pickNfpa substitutes '' when a source does not provide a string.
special: string;
}
/**
 * MSDS text sections stored on a substance record.
 * pickMsds fills each field independently; a field is '' when no source has a non-blank string for it.
 */
interface MsdsBlock {
hazard: string;
firstAid: string;
fireFighting: string;
spillResponse: string;
exposure: string;
regulation: string;
}
/**
 * One substance record.
 *
 * main() parses base.json as an array of this shape, and merge() returns the
 * same shape for the output file. merge() copies the following fields from the
 * base record unchanged: id, abbreviation, nameKr, nameEn, synonymsEn, the
 * ibc* fields, cargoCodes and portFrequency. All other fields may be filled
 * from PDF or OCR data.
 */
interface BaseRecord {
id: number;
abbreviation: string;
// Korean name; normalized and used as a matching key in main().
nameKr: string;
nameEn: string;
synonymsEn: string;
// Korean synonyms; main() splits this on ' / ' and uses each part as a fallback matching key.
synonymsKr: string;
unNumber: string;
// Normalized by normalizeCas and used as the first matching key against PDF data.
casNumber: string;
transportMethod: string;
sebc: string;
usage: string;
state: string;
color: string;
odor: string;
flashPoint: string;
autoIgnition: string;
boilingPoint: string;
density: string;
solubility: string;
vaporPressure: string;
vaporDensity: string;
explosionRange: string;
nfpa: NfpaBlock;
hazardClass: string;
ergNumber: string;
idlh: string;
aegl2: string;
erpg2: string;
responseDistanceFire: string;
responseDistanceSpillDay: string;
responseDistanceSpillNight: string;
marineResponse: string;
ppeClose: string;
ppeFar: string;
msds: MsdsBlock;
ibcHazard: string;
ibcShipType: string;
ibcTankType: string;
ibcDetection: string;
ibcFireFighting: string;
ibcMinRequirement: string;
emsCode: string;
emsFire: string;
emsSpill: string;
emsFirstAid: string;
cargoCodes: Array<{ code: string; name: string; company: string; source: string }>;
portFrequency: Array<{ port: string; portCode: string; lastImport: string; frequency: string }>;
}
/**
 * One entry of pdf-data.json (main() parses that file as a map from key to this shape).
 * Only the fields below are typed; any other property is read as `unknown`
 * and filtered through firstString when merged.
 */
interface PdfResult {
[key: string]: unknown;
// Used to build the CAS lookup index in main().
casNumber?: string;
nameKr?: string;
nfpa?: Partial<NfpaBlock>;
msds?: Partial<MsdsBlock>;
}
/**
 * One entry of ocr.json (main() parses that file as a map from key to this shape).
 * No fields are typed; every property is read as `unknown`.
 */
interface OcrResult {
[key: string]: unknown;
}
/**
 * Returns the first argument that is a non-blank string, trimmed.
 *
 * Non-string values and strings that are empty or whitespace-only are skipped.
 *
 * @param values Candidates in priority order.
 * @returns The trimmed winner, or '' when no candidate qualifies.
 */
function firstString(...values: Array<unknown>): string {
  const winner = values.find((candidate) => typeof candidate === 'string' && candidate.trim() !== '');
  return typeof winner === 'string' ? winner.trim() : '';
}
/**
 * Extracts a complete NFPA block from a PDF or OCR entry.
 *
 * A rating is accepted only if it is a finite number or a non-blank string
 * that parses to a finite number. Placeholder values such as null, '', booleans
 * or arrays are rejected: `Number()` would coerce them to 0 and produce a
 * bogus 0/0/0 rating that overrides real data from a lower-priority source.
 *
 * @param source PDF or OCR entry whose `nfpa` property is inspected.
 * @returns The NFPA block, or null when `nfpa` is missing, not an object, or
 *          any of health/fire/reactivity is not a usable number.
 */
function pickNfpa(source: PdfResult | OcrResult): NfpaBlock | null {
  const n = source.nfpa as Partial<NfpaBlock> | null | undefined;
  if (!n || typeof n !== 'object') return null;

  // Convert one raw rating; anything that is not a real number or numeric text becomes NaN.
  const toRating = (raw: unknown): number => {
    if (typeof raw === 'number') return raw;
    if (typeof raw === 'string' && raw.trim() !== '') return Number(raw);
    return Number.NaN;
  };

  const health = toRating(n.health);
  const fire = toRating(n.fire);
  const reactivity = toRating(n.reactivity);
  if (![health, fire, reactivity].every((x) => Number.isFinite(x))) return null;

  return {
    health,
    fire,
    reactivity,
    special: typeof n.special === 'string' ? n.special : '',
  };
}
/**
 * Builds the merged MSDS block for one record.
 *
 * For every section the base value wins; the PDF value is used only when the
 * base value is blank, and the OCR value only when both are blank.
 *
 * @param pdf Matched PDF entry, if any; its `msds` property is read.
 * @param ocr Matched OCR entry, if any; its `msds` property is read.
 * @param base MSDS block of the base record.
 * @returns A fully populated MSDS block ('' for sections no source provides).
 */
function pickMsds(
  pdf: PdfResult | undefined,
  ocr: OcrResult | undefined,
  base: MsdsBlock,
): MsdsBlock {
  const fromPdf = (pdf?.msds ?? {}) as Partial<MsdsBlock>;
  const fromOcr = (ocr?.msds ?? {}) as Partial<MsdsBlock>;

  // Resolve one section with base > pdf > ocr priority.
  const choose = (section: keyof MsdsBlock): string =>
    firstString(base[section], fromPdf[section], fromOcr[section]);

  return {
    hazard: choose('hazard'),
    firstAid: choose('firstAid'),
    fireFighting: choose('fireFighting'),
    spillResponse: choose('spillResponse'),
    exposure: choose('exposure'),
    regulation: choose('regulation'),
  };
}
/**
 * Combines one base record with its matched PDF and OCR entries.
 *
 * Fields not listed below are copied from `base` unchanged. Listed fields are
 * resolved with firstString using one of two priorities:
 *  - pdf > base > ocr for most descriptive and physical-property fields;
 *  - base > pdf > ocr for transportMethod, sebc, ergNumber, aegl2, erpg2,
 *    ppeClose, ppeFar, the ems* fields and (via pickMsds) msds.
 * NFPA is taken from pdf, then ocr, then base.
 *
 * @param base Record from base.json.
 * @param pdf Matched PDF entry, or undefined when none matched.
 * @param ocr Matched OCR entry, or undefined when none matched.
 * @returns A new merged record; `base` is not mutated.
 */
function merge(
  base: BaseRecord,
  pdf: PdfResult | undefined,
  ocr: OcrResult | undefined,
): BaseRecord {
  type Field = keyof BaseRecord;

  // pdf > base > ocr
  const pdfFirst = (field: Field): string =>
    firstString(pdf?.[field], base[field], ocr?.[field]);
  // base > pdf > ocr
  const baseFirst = (field: Field): string =>
    firstString(base[field], pdf?.[field], ocr?.[field]);

  // NFPA: pdf first, then ocr, then whatever the base record has.
  const pdfNfpa = pdf ? pickNfpa(pdf) : null;
  const ocrNfpa = ocr ? pickNfpa(ocr) : null;

  return {
    ...base,
    unNumber: pdfFirst('unNumber'),
    casNumber: pdfFirst('casNumber'),
    synonymsKr: pdfFirst('synonymsKr'),
    transportMethod: baseFirst('transportMethod'),
    sebc: baseFirst('sebc'),
    usage: pdfFirst('usage'),
    state: pdfFirst('state'),
    color: pdfFirst('color'),
    odor: pdfFirst('odor'),
    flashPoint: pdfFirst('flashPoint'),
    autoIgnition: pdfFirst('autoIgnition'),
    boilingPoint: pdfFirst('boilingPoint'),
    density: pdfFirst('density'),
    solubility: pdfFirst('solubility'),
    vaporPressure: pdfFirst('vaporPressure'),
    vaporDensity: pdfFirst('vaporDensity'),
    explosionRange: pdfFirst('explosionRange'),
    nfpa: pdfNfpa ?? ocrNfpa ?? base.nfpa,
    hazardClass: pdfFirst('hazardClass'),
    ergNumber: baseFirst('ergNumber'),
    idlh: pdfFirst('idlh'),
    aegl2: baseFirst('aegl2'),
    erpg2: baseFirst('erpg2'),
    responseDistanceFire: pdfFirst('responseDistanceFire'),
    responseDistanceSpillDay: pdfFirst('responseDistanceSpillDay'),
    responseDistanceSpillNight: pdfFirst('responseDistanceSpillNight'),
    marineResponse: pdfFirst('marineResponse'),
    ppeClose: baseFirst('ppeClose'),
    ppeFar: baseFirst('ppeFar'),
    msds: pickMsds(pdf, ocr, base.msds),
    emsCode: baseFirst('emsCode'),
    emsFire: baseFirst('emsFire'),
    emsSpill: baseFirst('emsSpill'),
    emsFirstAid: baseFirst('emsFirstAid'),
  };
}
/**
 * Script entry point: loads base.json (required) plus pdf-data.json and
 * ocr.json (optional), matches PDF/OCR entries to each base record, merges
 * them and writes the result to TARGET_PATH.
 *
 * PDF entries are matched by CAS number, then normalized Korean name, then
 * normalized Korean synonyms. OCR entries are matched by name, then synonyms.
 * PDF keys that matched no base record are written to out/pdf-unmatched.json.
 *
 * Exits the process with code 1 when base.json does not exist.
 */
function main() {
  if (!existsSync(BASE_PATH)) {
    console.error(`base.json 없음: ${BASE_PATH}`);
    console.error('→ extract-excel.py 를 먼저 실행하세요.');
    process.exit(1);
  }
  const base: BaseRecord[] = JSON.parse(readFileSync(BASE_PATH, 'utf-8'));
  // Optional inputs: a missing file is treated as an empty data set.
  const pdfRaw: Record<string, PdfResult> = existsSync(PDF_PATH)
    ? JSON.parse(readFileSync(PDF_PATH, 'utf-8'))
    : {};
  const ocr: Record<string, OcrResult> = existsSync(OCR_PATH)
    ? JSON.parse(readFileSync(OCR_PATH, 'utf-8'))
    : {};
  console.log(
    `[입력] base ${base.length}종, pdf ${Object.keys(pdfRaw).length}종, ocr ${Object.keys(ocr).length}`,
  );

  // ── PDF indexes ──────────────────────────────────────────────────────
  const pdfByName = new Map<string, PdfResult>();
  const pdfByCas = new Map<string, PdfResult>();
  // Entry → its original key in pdfRaw. The unmatched report is keyed by these
  // original keys, so matched entries must be removed by key, not by the
  // optional `nameKr` property, which may be absent or differ from the key.
  const pdfKeyOf = new Map<PdfResult, string>();
  for (const [key, value] of Object.entries(pdfRaw)) {
    pdfKeyOf.set(value, key);
    const normKey = normalizeName(key);
    if (normKey) pdfByName.set(normKey, value);
    const cas = normalizeCas(value.casNumber);
    // First entry wins when several entries share a CAS number.
    if (cas && !pdfByCas.has(cas)) pdfByCas.set(cas, value);
  }

  // ── OCR index ────────────────────────────────────────────────────────
  const ocrByName = new Map<string, OcrResult>();
  for (const [key, value] of Object.entries(ocr)) {
    const normKey = normalizeName(key);
    if (normKey) ocrByName.set(normKey, value);
  }

  // Returns the first index entry whose key equals a normalized synonym.
  const lookupBySynonym = <T>(index: Map<string, T>, synonymsKr: string): T | undefined => {
    for (const syn of synonymsKr.split(' / ')) {
      const normSyn = normalizeName(syn);
      if (!normSyn) continue;
      const hit = index.get(normSyn);
      if (hit) return hit;
    }
    return undefined;
  };

  // ── Merge ────────────────────────────────────────────────────────────
  let pdfMatchedByName = 0;
  let pdfMatchedByCas = 0;
  let pdfMatchedBySynonym = 0;
  let ocrMatched = 0;
  const pdfUnmatched = new Set(Object.keys(pdfRaw));

  const merged = base.map((record) => {
    const normKr = normalizeName(record.nameKr);

    // PDF: 1) CAS number, 2) normalized name, 3) normalized synonyms.
    let pdfResult: PdfResult | undefined;
    const baseCas = normalizeCas(record.casNumber);
    if (baseCas) {
      pdfResult = pdfByCas.get(baseCas);
      if (pdfResult) pdfMatchedByCas++;
    }
    if (!pdfResult) {
      pdfResult = pdfByName.get(normKr);
      if (pdfResult) pdfMatchedByName++;
    }
    if (!pdfResult && record.synonymsKr) {
      pdfResult = lookupBySynonym(pdfByName, record.synonymsKr);
      if (pdfResult) pdfMatchedBySynonym++;
    }
    if (pdfResult) {
      const origKey = pdfKeyOf.get(pdfResult);
      if (origKey !== undefined) pdfUnmatched.delete(origKey);
    }

    // OCR: 1) normalized name, 2) normalized synonyms.
    let ocrResult: OcrResult | undefined = ocrByName.get(normKr);
    if (!ocrResult && record.synonymsKr) {
      ocrResult = lookupBySynonym(ocrByName, record.synonymsKr);
    }
    if (ocrResult) ocrMatched++;

    return merge(record, pdfResult, ocrResult);
  });

  // ── Statistics ───────────────────────────────────────────────────────
  const pdfTotal = pdfMatchedByCas + pdfMatchedByName + pdfMatchedBySynonym;
  console.log(
    `[PDF 매칭] 총 ${pdfTotal}종 (CAS: ${pdfMatchedByCas}, 국문명: ${pdfMatchedByName}, 동의어: ${pdfMatchedBySynonym})`,
  );
  console.log(`[OCR 매칭] ${ocrMatched}`);

  if (pdfUnmatched.size > 0) {
    const unmatchedList = Array.from(pdfUnmatched).sort();
    const unmatchedPath = resolve(OUT_DIR, 'pdf-unmatched.json');
    writeFileSync(
      unmatchedPath,
      JSON.stringify({ count: unmatchedList.length, keys: unmatchedList }, null, 2),
      'utf-8',
    );
    console.warn(
      `[경고] PDF 매칭 실패 ${unmatchedList.length}개 → ${unmatchedPath}`,
    );
    // Show at most the first ten keys on the console; the full list is in the file.
    unmatchedList.slice(0, 10).forEach((k) => console.warn(` - ${k}`));
    if (unmatchedList.length > 10) console.warn(` ... +${unmatchedList.length - 10}`);
  }

  writeFileSync(TARGET_PATH, JSON.stringify(merged, null, 2), 'utf-8');
  // Approximate size: character count of the compact JSON, not the byte size of the written file.
  const sizeKb = (JSON.stringify(merged).length / 1024).toFixed(0);
  console.log(`[완료] ${TARGET_PATH} (${sizeKb} KB, ${merged.length}종)`);
  console.log(` 상세 정보 보유: ${merged.filter((r) => r.flashPoint).length}`);
  console.log(` NFPA 있음: ${merged.filter((r) => r.nfpa.health || r.nfpa.fire || r.nfpa.reactivity).length}`);
}
// Run the merge as soon as this module is executed.
main();