wing-ops/backend/scripts/hns-import/merge-data.ts

252 lines
8.4 KiB
TypeScript

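/**
 * base.json + ocr.json → frontend/src/data/hnsSubstanceData.json
 *
 * Matching key: each OCR key is compared against a record's Korean name (nameKr),
 * falling back to the individual entries of synonymsKr. Both sides are normalized
 * first (whitespace and the punctuation , . · / - _ ( ) [ ] removed, then lower-cased).
 * Merge rule: the Excel-derived base fields are kept; OCR values only fill string
 * fields that are empty in base (OCR does not take precedence). The one exception
 * is `nfpa`, which is replaced whenever OCR supplies a usable NFPA block.
 * In practice most physical-property / hazard fields are empty in base.json, so
 * they end up populated from OCR.
 */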
import { readFileSync, writeFileSync, existsSync } from 'node:fs';
import { resolve, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
// Directory containing this script, derived from import.meta.url (ES modules have no built-in __dirname).
const __dirname = dirname(fileURLToPath(import.meta.url));
// Folder next to this script that holds the intermediate JSON files read and written by main().
const OUT_DIR = resolve(__dirname, 'out');
// Required input: main() exits with an error when this file is missing.
const BASE_PATH = resolve(OUT_DIR, 'base.json');
// Optional input: main() warns and continues with base data only when this file is missing.
const OCR_PATH = resolve(OUT_DIR, 'ocr.json');
// Destination of the merged JSON array, resolved three directories above this script.
const TARGET_PATH = resolve(__dirname, '../../../frontend/src/data/hnsSubstanceData.json');
/**
 * Produces the comparison key used to match base records against OCR keys.
 *
 * All whitespace is removed, then the punctuation characters
 * `, . · / - _ ( ) [ ]`, and the remainder is lower-cased.
 *
 * @param s Name to normalize; `undefined` and `''` both yield `''`.
 * @returns The normalized key (possibly empty).
 */
function normalizeName(s: string | undefined): string {
  const raw = s || '';
  const withoutWhitespace = raw.replace(/\s+/g, '');
  const withoutPunctuation = withoutWhitespace.replace(/[,.·/\-_()[\]]/g, '');
  return withoutPunctuation.toLowerCase();
}
/**
 * Shape of the `nfpa` field on a BaseRecord: three numeric values plus a
 * `special` string. pickNfpa() builds one of these from an OCR entry.
 * NOTE(review): presumably NFPA 704 ratings; the valid range is not enforced here.
 */
interface NfpaBlock {
health: number;
fire: number;
reactivity: number;
// pickNfpa() falls back to '' when the OCR value is not a string.
special: string;
}
/**
 * Shape of the `msds` field on a BaseRecord: six string sections.
 * pickMsds() merges it section by section, preferring a non-blank base value
 * over the OCR value.
 */
interface MsdsBlock {
hazard: string;
firstAid: string;
fireFighting: string;
spillResponse: string;
exposure: string;
regulation: string;
}
/**
 * One element of base.json (main() parses that file as BaseRecord[]) and of the
 * merged array written to the target file.
 *
 * merge() never changes: id, abbreviation, nameKr, nameEn, synonymsEn,
 * synonymsKr, unNumber, casNumber, usage, the ibc* fields, cargoCodes and
 * portFrequency. Every other field may be filled from the matching OCR entry.
 */
interface BaseRecord {
id: number;
abbreviation: string;
// Primary matching key: compared (after normalizeName) against OCR keys.
nameKr: string;
nameEn: string;
synonymsEn: string;
// Fallback matching key: main() splits this on ' / ' and tries each entry.
synonymsKr: string;
unNumber: string;
casNumber: string;
transportMethod: string;
sebc: string;
usage: string;
state: string;
color: string;
odor: string;
// main() counts records with a non-empty flashPoint in its final summary line.
flashPoint: string;
autoIgnition: string;
boilingPoint: string;
density: string;
solubility: string;
vaporPressure: string;
vaporDensity: string;
explosionRange: string;
// Unlike the string fields, merge() replaces this with the OCR block whenever pickNfpa() returns one.
nfpa: NfpaBlock;
hazardClass: string;
ergNumber: string;
idlh: string;
aegl2: string;
erpg2: string;
responseDistanceFire: string;
responseDistanceSpillDay: string;
responseDistanceSpillNight: string;
marineResponse: string;
ppeClose: string;
ppeFar: string;
// Merged section by section via pickMsds().
msds: MsdsBlock;
ibcHazard: string;
ibcShipType: string;
ibcTankType: string;
ibcDetection: string;
ibcFireFighting: string;
ibcMinRequirement: string;
emsCode: string;
emsFire: string;
emsSpill: string;
emsFirstAid: string;
cargoCodes: Array<{ code: string; name: string; company: string; source: string }>;
portFrequency: Array<{ port: string; portCode: string; lastImport: string; frequency: string }>;
}
/**
 * One value of the ocr.json object (main() parses that file as
 * Record<string, OcrResult>). No schema is enforced: every property is
 * `unknown` and is narrowed at the point of use (firstString, pickNfpa, pickMsds).
 */
interface OcrResult {
[key: string]: unknown;
}
/**
 * Picks the first usable string among the candidates.
 *
 * A candidate is usable when it is a string that still has content after
 * trimming; anything else (non-strings, empty or whitespace-only strings) is
 * skipped.
 *
 * @param values Candidates in priority order.
 * @returns The first usable candidate, trimmed, or `''` when there is none.
 */
function firstString(...values: Array<unknown>): string {
  for (const candidate of values) {
    if (typeof candidate !== 'string') continue;
    const trimmed = candidate.trim();
    if (trimmed.length > 0) return trimmed;
  }
  return '';
}
/**
 * Converts a raw OCR value to a finite number.
 *
 * Only finite numbers and non-blank numeric strings are accepted. Everything
 * else yields `null` instead of being coerced, because `Number(null)`,
 * `Number('')`, `Number(false)` and `Number([])` all evaluate to `0`.
 */
function toFiniteNumber(value: unknown): number | null {
  if (typeof value === 'number') return Number.isFinite(value) ? value : null;
  if (typeof value === 'string' && value.trim() !== '') {
    const parsed = Number(value);
    return Number.isFinite(parsed) ? parsed : null;
  }
  return null;
}

/**
 * Extracts an NFPA block from an OCR entry.
 *
 * @param ocr OCR entry whose `nfpa` property is inspected.
 * @returns A block with numeric `health`, `fire` and `reactivity`, or `null`
 *   when `ocr.nfpa` is not an object or any of the three values is missing or
 *   not numeric. `special` is kept only when it is a string, otherwise `''`.
 *
 * Returning `null` for missing values matters to the caller: merge() replaces
 * the base NFPA block whenever this function returns a block, so a missing
 * rating must not be turned into a real-looking 0.
 */
function pickNfpa(ocr: OcrResult): NfpaBlock | null {
  const raw = ocr.nfpa;
  if (raw === null || typeof raw !== 'object') return null;

  const block = raw as Record<string, unknown>;
  const health = toFiniteNumber(block.health);
  const fire = toFiniteNumber(block.fire);
  const reactivity = toFiniteNumber(block.reactivity);
  if (health === null || fire === null || reactivity === null) return null;

  return {
    health,
    fire,
    reactivity,
    special: typeof block.special === 'string' ? block.special : '',
  };
}
/**
 * Builds the MSDS block for a merged record.
 *
 * Each of the six sections takes the base value when it is a non-blank string
 * and otherwise the OCR value (see firstString); a section missing from both
 * sides becomes `''`.
 *
 * @param ocr  OCR entry; its `msds` property is read, and a missing one is treated as `{}`.
 * @param base MSDS block of the base record.
 * @returns A new MSDS block; neither argument is modified.
 */
function pickMsds(ocr: OcrResult, base: MsdsBlock): MsdsBlock {
  const fromOcr = (ocr.msds ?? {}) as Partial<MsdsBlock>;
  const choose = (section: keyof MsdsBlock): string =>
    firstString(base[section], fromOcr[section]);

  return {
    hazard: choose('hazard'),
    firstAid: choose('firstAid'),
    fireFighting: choose('fireFighting'),
    spillResponse: choose('spillResponse'),
    exposure: choose('exposure'),
    regulation: choose('regulation'),
  };
}
/**
 * Combines one base record with its matching OCR entry.
 *
 * - Without an OCR entry the base record is returned as-is (same object).
 * - String fields listed below keep the base value when it is non-blank and
 *   otherwise take the OCR value (see firstString).
 * - `nfpa` is the exception: the OCR block wins whenever pickNfpa() returns
 *   one, and the base block is used only as the fallback.
 * - `msds` is merged section by section via pickMsds().
 * - Fields not listed here are copied from base unchanged.
 *
 * @param base Record from base.json.
 * @param ocr  Matching OCR entry, or `undefined` when there is none.
 * @returns The base record itself, or a new merged record.
 */
function merge(base: BaseRecord, ocr: OcrResult | undefined): BaseRecord {
  if (!ocr) return base;

  // Only string-valued BaseRecord keys can be filled through firstString.
  type TextField = {
    [K in keyof BaseRecord]: BaseRecord[K] extends string ? K : never;
  }[keyof BaseRecord];

  const source: OcrResult = ocr;
  const fill = (field: TextField): string => firstString(base[field], source[field]);

  return {
    ...base,
    transportMethod: fill('transportMethod'),
    sebc: fill('sebc'),
    state: fill('state'),
    color: fill('color'),
    odor: fill('odor'),
    flashPoint: fill('flashPoint'),
    autoIgnition: fill('autoIgnition'),
    boilingPoint: fill('boilingPoint'),
    density: fill('density'),
    solubility: fill('solubility'),
    vaporPressure: fill('vaporPressure'),
    vaporDensity: fill('vaporDensity'),
    explosionRange: fill('explosionRange'),
    nfpa: pickNfpa(source) ?? base.nfpa,
    hazardClass: fill('hazardClass'),
    ergNumber: fill('ergNumber'),
    idlh: fill('idlh'),
    aegl2: fill('aegl2'),
    erpg2: fill('erpg2'),
    responseDistanceFire: fill('responseDistanceFire'),
    responseDistanceSpillDay: fill('responseDistanceSpillDay'),
    responseDistanceSpillNight: fill('responseDistanceSpillNight'),
    marineResponse: fill('marineResponse'),
    ppeClose: fill('ppeClose'),
    ppeFar: fill('ppeFar'),
    msds: pickMsds(source, base.msds),
    emsCode: fill('emsCode'),
    emsFire: fill('emsFire'),
    emsSpill: fill('emsSpill'),
    emsFirstAid: fill('emsFirstAid'),
  };
}
/**
 * Builds the lookup used for matching: normalized OCR key → OCR entry, plus a
 * reverse map (normalized key → original key) used when reporting.
 *
 * Keys that normalize to an empty string are skipped. When two original keys
 * normalize to the same key the later one wins, and a warning is printed so
 * the dropped entry does not disappear silently.
 */
function buildOcrIndex(ocr: Record<string, OcrResult>): {
  ocrIndex: Map<string, OcrResult>;
  normToOrig: Map<string, string>;
} {
  const ocrIndex = new Map<string, OcrResult>();
  const normToOrig = new Map<string, string>();
  for (const [key, value] of Object.entries(ocr)) {
    const normKey = normalizeName(key);
    if (!normKey) continue;
    const previous = normToOrig.get(normKey);
    if (previous !== undefined) {
      console.warn(`[경고] OCR 키 중복 (정규화 후 동일): "${previous}" → "${key}" 로 대체`);
    }
    ocrIndex.set(normKey, value);
    normToOrig.set(normKey, key);
  }
  return { ocrIndex, normToOrig };
}

/**
 * Assigns OCR entries to base records, consuming each OCR entry at most once.
 *
 * Runs in two passes so that an exact nameKr match always beats a synonym
 * match: pass 1 matches every record by normalized nameKr, pass 2 lets the
 * still-unmatched records try their synonymsKr entries (split on ' / ').
 * Matching synonyms in the same pass would let an earlier record's synonym
 * take the entry that is a later record's exact name.
 *
 * Matched entries are removed from `ocrIndex`; whatever remains afterwards is
 * unmatched.
 *
 * @returns Assignments keyed by index into `base`, plus per-stage counts.
 */
function assignOcrEntries(
  base: BaseRecord[],
  ocrIndex: Map<string, OcrResult>,
): { assigned: Map<number, OcrResult>; byName: number; bySynonym: number } {
  const assigned = new Map<number, OcrResult>();

  // Pass 1: exact match on the normalized Korean name.
  base.forEach((record, i) => {
    const key = normalizeName(record.nameKr);
    const hit = ocrIndex.get(key);
    if (hit) {
      assigned.set(i, hit);
      ocrIndex.delete(key);
    }
  });
  const byName = assigned.size;

  // Pass 2: synonym fallback for records that are still unmatched.
  base.forEach((record, i) => {
    if (assigned.has(i) || !record.synonymsKr) return;
    for (const syn of record.synonymsKr.split(' / ')) {
      const normSyn = normalizeName(syn);
      if (!normSyn) continue;
      const hit = ocrIndex.get(normSyn);
      if (hit) {
        assigned.set(i, hit);
        ocrIndex.delete(normSyn);
        return;
      }
    }
  });

  return { assigned, byName, bySynonym: assigned.size - byName };
}

/**
 * Script entry point: reads base.json (required) and ocr.json (optional),
 * merges each base record with its matching OCR entry and writes the merged
 * array to TARGET_PATH.
 *
 * OCR keys that match no base record are written to `merge-unmatched.json` in
 * OUT_DIR, and the first 20 are echoed to the console. The process exits with
 * code 1 when base.json is missing or either input has the wrong top-level
 * JSON shape.
 */
function main(): void {
  if (!existsSync(BASE_PATH)) {
    console.error(`base.json 없음: ${BASE_PATH}`);
    console.error('→ extract-excel.py 를 먼저 실행하세요.');
    process.exit(1);
  }
  const hasOcr = existsSync(OCR_PATH);
  if (!hasOcr) {
    console.warn(`ocr.json 없음: ${OCR_PATH} — 상세 데이터 없이 base 만 사용`);
  }

  // Validate only the top-level shape; individual fields are narrowed where they are used.
  const parsedBase: unknown = JSON.parse(readFileSync(BASE_PATH, 'utf-8'));
  if (!Array.isArray(parsedBase)) {
    console.error(`base.json 형식 오류 (배열이 아님): ${BASE_PATH}`);
    process.exit(1);
  }
  const base = parsedBase as BaseRecord[];

  const parsedOcr: unknown = hasOcr ? JSON.parse(readFileSync(OCR_PATH, 'utf-8')) : {};
  if (parsedOcr === null || typeof parsedOcr !== 'object' || Array.isArray(parsedOcr)) {
    console.error(`ocr.json 형식 오류 (객체가 아님): ${OCR_PATH}`);
    process.exit(1);
  }
  const ocr = parsedOcr as Record<string, OcrResult>;

  console.log(`[입력] base ${base.length}종, ocr ${Object.keys(ocr).length}`);

  const { ocrIndex, normToOrig } = buildOcrIndex(ocr);
  const { assigned, byName, bySynonym } = assignOcrEntries(base, ocrIndex);
  const merged = base.map((record, i) => merge(record, assigned.get(i)));

  console.log(`[병합] base ↔ ocr 매칭 ${byName + bySynonym}종 (nameKr: ${byName}, 동의어: ${bySynonym})`);

  // Entries left in the index matched no base record; report them under their original keys.
  const unmatched = Array.from(ocrIndex.keys(), (normKey) => normToOrig.get(normKey) ?? normKey).sort();
  if (unmatched.length > 0) {
    const unmatchedPath = resolve(OUT_DIR, 'merge-unmatched.json');
    writeFileSync(unmatchedPath, JSON.stringify({ count: unmatched.length, keys: unmatched }, null, 2), 'utf-8');
    console.warn(`[경고] OCR 매칭 실패 ${unmatched.length}개 → ${unmatchedPath}`);
    unmatched.slice(0, 20).forEach((k) => console.warn(` - ${k}`));
    if (unmatched.length > 20) console.warn(` ... +${unmatched.length - 20}`);
  }

  // Serialize once and report the UTF-8 byte size of exactly what was written.
  const json = JSON.stringify(merged, null, 2);
  writeFileSync(TARGET_PATH, json, 'utf-8');
  const sizeKb = (Buffer.byteLength(json, 'utf-8') / 1024).toFixed(0);
  console.log(`[완료] ${TARGET_PATH} (${sizeKb} KB, ${merged.length}종)`);
  console.log(` 상세 정보 보유: ${merged.filter((r) => r.flashPoint).length}`);
}

main();