363 lines
13 KiB
TypeScript
363 lines
13 KiB
TypeScript
/**
 * base.json + pdf-data.json + ocr.json → frontend/src/data/hnsSubstanceData.json
 *
 * 우선순위: pdf-data (PDF 텍스트 추출, 최고 정확도) > base.json > ocr.json (이미지 OCR, 낮은 정확도)
 * 매칭 키 순서:
 *   1. CAS 번호 (가장 신뢰할 수 있는 식별자)
 *   2. 국문명(nameKr) 정규화 비교
 *   3. 동의어(synonymsKr) 정규화 비교
 */
|
||
import { readFileSync, writeFileSync, existsSync } from 'node:fs';
|
||
import { resolve, dirname } from 'node:path';
|
||
import { fileURLToPath } from 'node:url';
|
||
|
||
// ES modules have no built-in __dirname, so derive it from this module's URL.
const __dirname = dirname(fileURLToPath(import.meta.url));

// Directory that holds the intermediate JSON artifacts, next to this script.
const OUT_DIR = resolve(__dirname, 'out');

// Merge inputs, all located directly inside OUT_DIR.
const BASE_PATH = resolve(__dirname, 'out', 'base.json');
const PDF_PATH = resolve(__dirname, 'out', 'pdf-data.json');
const OCR_PATH = resolve(__dirname, 'out', 'ocr.json');

// Merge output: the data file under the frontend source tree.
const TARGET_PATH = resolve(
  __dirname,
  '../../../frontend/src/data/hnsSubstanceData.json',
);
|
||
|
||
/**
 * Builds a comparison key from a substance name.
 *
 * The key is the input with Unicode compatibility normalization (NFKC)
 * applied, all whitespace removed, the punctuation `, . · / - _ ( ) [ ]`
 * removed, and the remainder lower-cased.
 *
 * NFKC is applied first so that visually identical names compare equal:
 * full-width punctuation such as `（ ）` folds to its ASCII form (and is then
 * stripped), and decomposed Hangul (NFD) is recomposed.
 *
 * @param s Raw name; `undefined` or an empty string yields `''`.
 * @returns The normalized key, or `''` when there is nothing to normalize.
 */
function normalizeName(s: string | undefined): string {
  if (!s) return '';
  return s
    .normalize('NFKC')
    .replace(/\s+/g, '')
    .replace(/[,.·/\-_()[\]]/g, '')
    .toLowerCase();
}
|
||
|
||
/**
 * Builds a comparison key from a CAS number string.
 *
 * Steps: fold compatibility forms (full-width digits and hyphens) with NFKC,
 * map Unicode dash variants to an ASCII hyphen, drop every character that is
 * not a digit or hyphen, then strip leading zeros.
 *
 * A result that contains no digit at all (for example the placeholder `"-"`)
 * is reported as `''`, so placeholder values never act as a lookup key that
 * would match unrelated records.
 *
 * @param s Raw CAS value; `undefined` or an empty string yields `''`.
 * @returns The normalized CAS key, or `''` when no usable number is present.
 */
function normalizeCas(s: string | undefined): string {
  if (!s) return '';
  const cleaned = s
    .normalize('NFKC')
    // Hyphen, non-breaking hyphen, figure/en/em dash, horizontal bar, minus sign.
    .replace(/[\u2010-\u2015\u2212]/g, '-')
    .replace(/[^0-9-]/g, '')
    .replace(/^0+/, '');
  return /[0-9]/.test(cleaned) ? cleaned : '';
}
|
||
|
||
/**
 * NFPA rating block attached to each substance record.
 * Three numeric ratings plus a free-text special marker.
 * (Presumably the NFPA 704 diamond; value ranges are not enforced here.)
 */
interface NfpaBlock {
  /** Health rating. */
  health: number;
  /** Fire rating. */
  fire: number;
  /** Reactivity rating. */
  reactivity: number;
  /** Special-hazard marker; empty string when there is none. */
  special: string;
}
|
||
|
||
/**
 * MSDS text sections attached to each substance record.
 * Every section is a plain string.
 */
interface MsdsBlock {
  /** Hazard section text. */
  hazard: string;
  /** First-aid section text. */
  firstAid: string;
  /** Fire-fighting section text. */
  fireFighting: string;
  /** Spill-response section text. */
  spillResponse: string;
  /** Exposure section text. */
  exposure: string;
  /** Regulation section text. */
  regulation: string;
}
|
||
|
||
/**
 * One substance record: the element type of base.json and of the merged
 * output array. Apart from `id`, the nested `nfpa` / `msds` blocks and the
 * two list fields, every property is a string.
 *
 * The section comments below group fields by name only.
 */
interface BaseRecord {
  // Identification
  id: number;
  abbreviation: string;
  nameKr: string;
  nameEn: string;
  synonymsEn: string;
  synonymsKr: string;
  unNumber: string;
  casNumber: string;
  transportMethod: string;
  sebc: string;

  // Description and physical properties
  usage: string;
  state: string;
  color: string;
  odor: string;
  flashPoint: string;
  autoIgnition: string;
  boilingPoint: string;
  density: string;
  solubility: string;
  vaporPressure: string;
  vaporDensity: string;
  explosionRange: string;

  // Hazard ratings and exposure limits
  nfpa: NfpaBlock;
  hazardClass: string;
  ergNumber: string;
  idlh: string;
  aegl2: string;
  erpg2: string;

  // Response guidance
  responseDistanceFire: string;
  responseDistanceSpillDay: string;
  responseDistanceSpillNight: string;
  marineResponse: string;
  ppeClose: string;
  ppeFar: string;
  msds: MsdsBlock;

  // IBC fields
  ibcHazard: string;
  ibcShipType: string;
  ibcTankType: string;
  ibcDetection: string;
  ibcFireFighting: string;
  ibcMinRequirement: string;

  // EmS fields
  emsCode: string;
  emsFire: string;
  emsSpill: string;
  emsFirstAid: string;

  // List-valued fields
  cargoCodes: { code: string; name: string; company: string; source: string }[];
  portFrequency: { port: string; portCode: string; lastImport: string; frequency: string }[];
}
|
||
|
||
/**
 * One entry of pdf-data.json.
 * Only the fields read with a specific type are declared explicitly; any
 * other key is allowed and typed `unknown`, so it must be narrowed before use.
 */
interface PdfResult {
  casNumber?: string;
  nameKr?: string;
  nfpa?: Partial<NfpaBlock>;
  msds?: Partial<MsdsBlock>;
  /** Any other extracted field. */
  [key: string]: unknown;
}
|
||
|
||
/**
 * One entry of ocr.json.
 * No field is declared up front: every key is allowed and typed `unknown`,
 * so values must be narrowed before use.
 */
interface OcrResult {
  [key: string]: unknown;
}
|
||
|
||
/**
 * Returns the first argument that is a string with non-whitespace content,
 * trimmed. Non-string arguments and blank strings are skipped.
 *
 * @param values Candidates in priority order.
 * @returns The trimmed winner, or `''` when no candidate qualifies.
 */
function firstString(...values: Array<unknown>): string {
  for (const candidate of values) {
    if (typeof candidate !== 'string') continue;
    const trimmed = candidate.trim();
    if (trimmed.length > 0) return trimmed;
  }
  return '';
}
|
||
|
||
/**
 * Extracts a complete NFPA block from a PDF or OCR entry.
 *
 * A rating is accepted only when it is a finite number or a non-blank string
 * that parses to a finite number. Values such as `null`, `''`, booleans or
 * arrays are rejected: a bare `Number(...)` would coerce them to 0 or 1 and
 * fabricate a rating that then overrides real data from another source.
 *
 * @param source Entry whose `nfpa` property is inspected.
 * @returns The block when `health`, `fire` and `reactivity` are all usable
 *          (`special` defaults to `''` unless it is a string), otherwise `null`.
 */
function pickNfpa(source: PdfResult | OcrResult): NfpaBlock | null {
  const raw: unknown = source.nfpa;
  if (raw === null || typeof raw !== 'object') return null;
  // Runtime check above guarantees an object; fields are still untrusted.
  const block = raw as Record<string, unknown>;

  const toRating = (value: unknown): number | null => {
    if (typeof value === 'number') return Number.isFinite(value) ? value : null;
    if (typeof value === 'string' && value.trim() !== '') {
      const parsed = Number(value);
      return Number.isFinite(parsed) ? parsed : null;
    }
    return null;
  };

  const health = toRating(block.health);
  const fire = toRating(block.fire);
  const reactivity = toRating(block.reactivity);
  if (health === null || fire === null || reactivity === null) return null;

  return {
    health,
    fire,
    reactivity,
    special: typeof block.special === 'string' ? block.special : '',
  };
}
|
||
|
||
/**
 * Builds the merged MSDS block, one section at a time.
 *
 * For every section the first non-blank string wins, checked in the order
 * base → PDF → OCR. A missing `msds` on the PDF or OCR entry is treated as
 * an empty object.
 *
 * @param pdf  Matched PDF entry, if any.
 * @param ocr  Matched OCR entry, if any.
 * @param base MSDS block of the base record.
 * @returns A block in which every section is a (possibly empty) string.
 */
function pickMsds(
  pdf: PdfResult | undefined,
  ocr: OcrResult | undefined,
  base: MsdsBlock,
): MsdsBlock {
  const fromPdf = (pdf?.msds ?? {}) as Partial<MsdsBlock>;
  const fromOcr = (ocr?.msds ?? {}) as Partial<MsdsBlock>;

  const choose = (section: keyof MsdsBlock): string =>
    firstString(base[section], fromPdf[section], fromOcr[section]);

  return {
    hazard: choose('hazard'),
    firstAid: choose('firstAid'),
    fireFighting: choose('fireFighting'),
    spillResponse: choose('spillResponse'),
    exposure: choose('exposure'),
    regulation: choose('regulation'),
  };
}
|
||
|
||
/**
 * Combines one base record with its matched PDF and OCR entries.
 *
 * Every property of `base` is carried over, then the fields listed below are
 * recomputed with `firstString`, so each becomes a trimmed string (or `''`).
 * Two priority orders are in use:
 *   - PDF → base → OCR for most fields;
 *   - base → PDF → OCR for transportMethod, sebc, ergNumber, aegl2, erpg2,
 *     ppeClose, ppeFar, the ems* fields and (via `pickMsds`) the MSDS block.
 * NFPA is the exception: PDF → OCR → base, taking the first source that
 * yields a complete block from `pickNfpa`.
 *
 * @param base Record from base.json.
 * @param pdf  Matched PDF entry, if any.
 * @param ocr  Matched OCR entry, if any.
 * @returns A new record; `base` is not mutated.
 */
function merge(
  base: BaseRecord,
  pdf: PdfResult | undefined,
  ocr: OcrResult | undefined,
): BaseRecord {
  // PDF value first, then base, then OCR.
  const preferPdf = (field: keyof BaseRecord): string =>
    firstString(pdf?.[field], base[field], ocr?.[field]);
  // Base value first, then PDF, then OCR.
  const preferBase = (field: keyof BaseRecord): string =>
    firstString(base[field], pdf?.[field], ocr?.[field]);

  // NFPA: PDF block if complete, else OCR block, else whatever base holds.
  const nfpa =
    (pdf ? pickNfpa(pdf) : null) ?? (ocr ? pickNfpa(ocr) : null) ?? base.nfpa;

  return {
    ...base,
    unNumber: preferPdf('unNumber'),
    casNumber: preferPdf('casNumber'),
    synonymsKr: preferPdf('synonymsKr'),
    transportMethod: preferBase('transportMethod'),
    sebc: preferBase('sebc'),
    usage: preferPdf('usage'),
    state: preferPdf('state'),
    color: preferPdf('color'),
    odor: preferPdf('odor'),
    flashPoint: preferPdf('flashPoint'),
    autoIgnition: preferPdf('autoIgnition'),
    boilingPoint: preferPdf('boilingPoint'),
    density: preferPdf('density'),
    solubility: preferPdf('solubility'),
    vaporPressure: preferPdf('vaporPressure'),
    vaporDensity: preferPdf('vaporDensity'),
    explosionRange: preferPdf('explosionRange'),
    nfpa,
    hazardClass: preferPdf('hazardClass'),
    ergNumber: preferBase('ergNumber'),
    idlh: preferPdf('idlh'),
    aegl2: preferBase('aegl2'),
    erpg2: preferBase('erpg2'),
    responseDistanceFire: preferPdf('responseDistanceFire'),
    responseDistanceSpillDay: preferPdf('responseDistanceSpillDay'),
    responseDistanceSpillNight: preferPdf('responseDistanceSpillNight'),
    marineResponse: preferPdf('marineResponse'),
    ppeClose: preferBase('ppeClose'),
    ppeFar: preferBase('ppeFar'),
    msds: pickMsds(pdf, ocr, base.msds),
    emsCode: preferBase('emsCode'),
    emsFire: preferBase('emsFire'),
    emsSpill: preferBase('emsSpill'),
    emsFirstAid: preferBase('emsFirstAid'),
  };
}
|
||
|
||
/** Result of a name/synonym lookup: the entry found, the normalized key that hit, and how. */
interface NameMatch<T> {
  value: T;
  key: string;
  via: 'name' | 'synonym';
}

/**
 * Looks a base record up in a name-keyed index: first by its normalized
 * `nameKr`, then by each ' / '-separated entry of `synonymsKr` in order.
 *
 * @returns The first hit, or `undefined` when neither the name nor any synonym is indexed.
 */
function findByNameOrSynonym<T>(
  index: ReadonlyMap<string, T>,
  record: BaseRecord,
): NameMatch<T> | undefined {
  const nameKey = normalizeName(record.nameKr);
  const byName = index.get(nameKey);
  if (byName) return { value: byName, key: nameKey, via: 'name' };

  if (!record.synonymsKr) return undefined;
  for (const synonym of record.synonymsKr.split(' / ')) {
    const synonymKey = normalizeName(synonym);
    if (!synonymKey) continue;
    const bySynonym = index.get(synonymKey);
    if (bySynonym) return { value: bySynonym, key: synonymKey, via: 'synonym' };
  }
  return undefined;
}

/**
 * Script entry point: loads base.json (required) plus pdf-data.json and
 * ocr.json (optional), matches every base record against the PDF and OCR
 * entries, merges them and writes the result to TARGET_PATH.
 *
 * PDF matching order: CAS number, then normalized Korean name, then synonyms.
 * OCR matching order: normalized Korean name, then synonyms.
 *
 * Side effects: prints statistics to the console, writes
 * `pdf-unmatched.json` into OUT_DIR when some PDF entries matched no base
 * record, and exits the process with code 1 when base.json is missing or is
 * not a JSON array.
 */
function main(): void {
  if (!existsSync(BASE_PATH)) {
    console.error(`base.json 없음: ${BASE_PATH}`);
    console.error('→ extract-excel.py 를 먼저 실행하세요.');
    process.exit(1);
  }

  const baseParsed: unknown = JSON.parse(readFileSync(BASE_PATH, 'utf-8'));
  if (!Array.isArray(baseParsed)) {
    // Fail with a clear message instead of a later "base.map is not a function".
    console.error(`base.json 형식 오류: 최상위 값이 배열이 아닙니다 (${BASE_PATH})`);
    process.exit(1);
  }
  const base: BaseRecord[] = baseParsed;

  // Optional inputs: a missing file is treated as an empty dictionary.
  const pdfRaw: Record<string, PdfResult> = existsSync(PDF_PATH)
    ? JSON.parse(readFileSync(PDF_PATH, 'utf-8'))
    : {};
  const ocr: Record<string, OcrResult> = existsSync(OCR_PATH)
    ? JSON.parse(readFileSync(OCR_PATH, 'utf-8'))
    : {};

  console.log(
    `[입력] base ${base.length}종, pdf ${Object.keys(pdfRaw).length}종, ocr ${Object.keys(ocr).length}종`,
  );

  // ── PDF indexes ─────────────────────────────────────────────────────
  const pdfByName = new Map<string, PdfResult>(); // normalized dictionary key → entry
  const pdfByCas = new Map<string, PdfResult>(); // normalized CAS → first entry with that CAS
  // Entry → its original dictionary key. The unmatched set below is keyed by
  // dictionary key, so bookkeeping must use that key rather than `nameKr`,
  // which may be absent or spelled differently.
  const pdfKeyOf = new Map<PdfResult, string>();

  for (const [key, value] of Object.entries(pdfRaw)) {
    pdfKeyOf.set(value, key);

    const normKey = normalizeName(key);
    if (normKey) pdfByName.set(normKey, value);

    const cas = normalizeCas(value.casNumber);
    if (cas && !pdfByCas.has(cas)) pdfByCas.set(cas, value);
  }

  // ── OCR index ───────────────────────────────────────────────────────
  const ocrByName = new Map<string, OcrResult>();
  const ocrNormToOrig = new Map<string, string>(); // normalized key → original key, for reporting
  for (const [key, value] of Object.entries(ocr)) {
    const normKey = normalizeName(key);
    if (normKey) {
      ocrByName.set(normKey, value);
      ocrNormToOrig.set(normKey, key);
    }
  }

  // ── Merge ───────────────────────────────────────────────────────────
  let pdfMatchedByName = 0;
  let pdfMatchedByCas = 0;
  let pdfMatchedBySynonym = 0;
  let ocrMatched = 0;
  const pdfUnmatched = new Set(Object.keys(pdfRaw));
  const ocrUnmatched = new Set(ocrByName.keys());

  const merged = base.map((record) => {
    let pdfResult: PdfResult | undefined;

    // 1. CAS number (most reliable identifier).
    const baseCas = normalizeCas(record.casNumber);
    if (baseCas) {
      pdfResult = pdfByCas.get(baseCas);
      if (pdfResult) pdfMatchedByCas++;
    }

    // 2./3. Normalized Korean name, then synonyms.
    if (!pdfResult) {
      const pdfHit = findByNameOrSynonym(pdfByName, record);
      if (pdfHit) {
        pdfResult = pdfHit.value;
        if (pdfHit.via === 'name') pdfMatchedByName++;
        else pdfMatchedBySynonym++;
      }
    }

    if (pdfResult) {
      const sourceKey = pdfKeyOf.get(pdfResult);
      if (sourceKey !== undefined) pdfUnmatched.delete(sourceKey);
    }

    // OCR is looked up independently of the PDF result; merge() decides priority.
    const ocrHit = findByNameOrSynonym(ocrByName, record);
    if (ocrHit) {
      ocrMatched++;
      ocrUnmatched.delete(ocrHit.key);
    }

    return merge(record, pdfResult, ocrHit?.value);
  });

  // ── Statistics ──────────────────────────────────────────────────────
  const pdfTotal = pdfMatchedByCas + pdfMatchedByName + pdfMatchedBySynonym;
  console.log(
    `[PDF 매칭] 총 ${pdfTotal}종 (CAS: ${pdfMatchedByCas}, 국문명: ${pdfMatchedByName}, 동의어: ${pdfMatchedBySynonym})`,
  );
  console.log(`[OCR 매칭] ${ocrMatched}종`);

  if (pdfUnmatched.size > 0) {
    const unmatchedList = Array.from(pdfUnmatched).sort();
    const unmatchedPath = resolve(OUT_DIR, 'pdf-unmatched.json');
    writeFileSync(
      unmatchedPath,
      JSON.stringify({ count: unmatchedList.length, keys: unmatchedList }, null, 2),
      'utf-8',
    );
    console.warn(
      `[경고] PDF 매칭 실패 ${unmatchedList.length}개 → ${unmatchedPath}`,
    );
    unmatchedList.slice(0, 10).forEach((k) => console.warn(` - ${k}`));
    if (unmatchedList.length > 10) console.warn(` ... +${unmatchedList.length - 10}`);
  }

  if (ocrUnmatched.size > 0) {
    // Report OCR entries that matched no base record, by their original key.
    const ocrList = Array.from(ocrUnmatched, (k) => ocrNormToOrig.get(k) ?? k).sort();
    console.warn(`[경고] OCR 매칭 실패 ${ocrList.length}개`);
    ocrList.slice(0, 10).forEach((k) => console.warn(` - ${k}`));
    if (ocrList.length > 10) console.warn(` ... +${ocrList.length - 10}`);
  }

  // Serialize once and report the byte size of exactly what is written.
  const output = JSON.stringify(merged, null, 2);
  writeFileSync(TARGET_PATH, output, 'utf-8');
  const sizeKb = (Buffer.byteLength(output, 'utf-8') / 1024).toFixed(0);
  console.log(`[완료] ${TARGET_PATH} (${sizeKb} KB, ${merged.length}종)`);
  console.log(` 상세 정보 보유: ${merged.filter((r) => r.flashPoint).length}종`);
  console.log(` NFPA 있음: ${merged.filter((r) => r.nfpa.health || r.nfpa.fire || r.nfpa.reactivity).length}종`);
}
|
||
|
||
main();
|