summaryrefslogtreecommitdiff
path: root/app/api/ocr/utils/tableExtraction.ts
diff options
context:
space:
mode:
authordujinkim <dujin.kim@dtsolution.co.kr>2025-06-11 12:18:38 +0000
committerdujinkim <dujin.kim@dtsolution.co.kr>2025-06-11 12:18:38 +0000
commitff902243a658067fae858a615c0629aa2e0a4837 (patch)
tree42d30e986d1cbfb282c644c01730cd053b816b7a /app/api/ocr/utils/tableExtraction.ts
parent42e38f41cb4c0b4bf9c08b71ed087cd7f0c7fc18 (diff)
(대표님) 20250611 21시 15분 OCR 등
Diffstat (limited to 'app/api/ocr/utils/tableExtraction.ts')
-rw-r--r--app/api/ocr/utils/tableExtraction.ts611
1 files changed, 161 insertions, 450 deletions
diff --git a/app/api/ocr/utils/tableExtraction.ts b/app/api/ocr/utils/tableExtraction.ts
index ea543f8e..720e5a5f 100644
--- a/app/api/ocr/utils/tableExtraction.ts
+++ b/app/api/ocr/utils/tableExtraction.ts
@@ -1,7 +1,11 @@
// app/api/ocr/utils/tableExtraction.ts
-// 완전한 테이블 추출 로직 구현
+// 개선된 완전 테이블 추출 로직 – Format‑1 식별번호 파싱 보강 & 중복 행 제거
-interface ExtractedRow {
+/* -------------------------------------------------------------------------- */
+/* 타입 */
+/* -------------------------------------------------------------------------- */
+
+export interface ExtractedRow {
no: string;
identificationNo: string;
tagNo: string;
@@ -41,516 +45,223 @@ interface ColumnMapping {
weldingDate: number;
}
-// 메인 테이블 추출 함수
-export async function extractTablesFromOCR(ocrResult: any): Promise<ExtractedRow[][]> {
- const extractedTables: ExtractedRow[][] = [];
- const warnings: string[] = [];
+/* -------------------------------------------------------------------------- */
+/* 메인 */
+/* -------------------------------------------------------------------------- */
- if (!ocrResult || !ocrResult.images) {
- console.warn('No OCR images found in result');
- return [];
- }
+export async function extractTablesFromOCR (ocrResult: any): Promise<ExtractedRow[][]> {
+ const tables: ExtractedRow[][] = [];
+ if (!ocrResult?.images) return tables;
- for (let imageIndex = 0; imageIndex < ocrResult.images.length; imageIndex++) {
- const image = ocrResult.images[imageIndex];
-
- if (!image.tables || image.tables.length === 0) {
- console.warn(`No tables found in image ${imageIndex}`);
- continue;
- }
+ ocrResult.images.forEach((image: any, imgIdx: number) => {
+ image.tables?.forEach((table: OCRTable, tblIdx: number) => {
+ if (!isRelevantTable(table)) return;
+ const rows = extractTableData(table, imgIdx, tblIdx);
+ if (rows.length) tables.push(rows);
+ });
+ });
+ return tables;
+}
- for (let tableIndex = 0; tableIndex < image.tables.length; tableIndex++) {
- const table = image.tables[tableIndex];
-
- try {
- if (isRelevantTable(table)) {
- const extractedRows = extractTableData(table, imageIndex, tableIndex);
-
- if (extractedRows.length > 0) {
- extractedTables.push(extractedRows);
- console.log(`Successfully extracted ${extractedRows.length} rows from table ${tableIndex + 1} in image ${imageIndex + 1}`);
- } else {
- console.warn(`Table ${tableIndex + 1} in image ${imageIndex + 1} was identified as relevant but no data could be extracted`);
- }
- } else {
- console.log(`Table ${tableIndex + 1} in image ${imageIndex + 1} is not relevant (no required headers found)`);
- }
- } catch (error) {
- console.error(`Error processing table ${tableIndex + 1} in image ${imageIndex + 1}:`, error);
- }
- }
- }
+/* -------------------------------------------------------------------------- */
+/* 관련 테이블 판별 */
+/* -------------------------------------------------------------------------- */
- console.log(`Total extracted tables: ${extractedTables.length}`);
- return extractedTables;
+function isRelevantTable (table: OCRTable): boolean {
+ const headers = table.cells.filter(c => c.rowIndex < 3).map(getCellText).join(' ').toLowerCase();
+ return /\bno\b|번호/.test(headers) && /identification|식별|ident|id/.test(headers);
}
-// 관련 테이블인지 확인
-function isRelevantTable(table: OCRTable): boolean {
- if (!table.cells || table.cells.length === 0) {
- return false;
- }
+/* -------------------------------------------------------------------------- */
+/* 표 해석 */
+/* -------------------------------------------------------------------------- */
- // 첫 3행에서 헤더 찾기
- const headerCells = table.cells.filter(cell => cell.rowIndex <= 2);
- const headerTexts = headerCells
- .map(cell => getCellText(cell).toLowerCase())
- .filter(text => text.length > 0);
-
- console.log('Header texts found:', headerTexts);
-
- // 필수 키워드 확인
- const hasNo = headerTexts.some(text =>
- text.includes('no.') ||
- text === 'no' ||
- text.includes('번호') ||
- text.match(/^no\.?$/i)
- );
-
- const hasIdentification = headerTexts.some(text =>
- text.includes('identification') ||
- text.includes('식별') ||
- text.includes('ident') ||
- text.includes('id')
- );
-
- // 테이블 품질 확인
- const hasMinimumCells = table.cells.length >= 6; // 최소 헤더 + 데이터
- const hasReasonableConfidence = table.inferConfidence >= 0.5; // 신뢰도 기준 완화
-
- const isRelevant = hasNo && hasIdentification && hasMinimumCells && hasReasonableConfidence;
-
- console.log(`Table relevance check: hasNo=${hasNo}, hasIdentification=${hasIdentification}, minCells=${hasMinimumCells}, confidence=${hasReasonableConfidence} => ${isRelevant}`);
-
- return isRelevant;
-}
+function extractTableData (table: OCRTable, imgIdx: number, tblIdx: number): ExtractedRow[] {
+ const grid = buildGrid(table);
+ const headerRowIdx = findHeaderRow(grid);
+ if (headerRowIdx === -1) return [];
-// 테이블 데이터 추출
-function extractTableData(table: OCRTable, imageIndex: number, tableIndex: number): ExtractedRow[] {
- console.log(`Processing table ${tableIndex + 1} in image ${imageIndex + 1}`);
-
- // 테이블 그리드 구축
- const tableGrid = buildTableGrid(table);
-
- if (tableGrid.length < 2) {
- console.warn('Table has less than 2 rows (need header + data)');
- return [];
- }
+ const format = detectFormat(grid[headerRowIdx]);
+ const mapping = mapColumns(grid[headerRowIdx]);
- console.log(`Table grid built: ${tableGrid.length} rows, ${tableGrid[0]?.length || 0} columns`);
+ const seen = new Set<string>();
+ const data: ExtractedRow[] = [];
- // 헤더 행 찾기
- const headerRowIndex = findHeaderRow(tableGrid);
- if (headerRowIndex === -1) {
- console.warn('No header row found');
- return [];
- }
+ for (let r = headerRowIdx + 1; r < grid.length; r++) {
+ const row = grid[r];
+ if (isBlankRow(row)) continue;
- console.log(`Header row found at index: ${headerRowIndex}`);
-
- // 테이블 형식 결정
- const headerRow = tableGrid[headerRowIndex];
- const tableFormat = determineTableFormat(headerRow);
- console.log(`Table format detected: ${tableFormat}`);
-
- // 컬럼 매핑 찾기
- const columnMapping = findColumnMapping(headerRow, tableFormat);
- console.log('Column mapping:', columnMapping);
-
- // 데이터 행 추출
- const dataRows: ExtractedRow[] = [];
-
- for (let i = headerRowIndex + 1; i < tableGrid.length; i++) {
- const row = tableGrid[i];
-
- if (row && row.length > 0 && !isEmptyRow(row)) {
- try {
- const extractedRow = extractRowData(row, tableFormat, columnMapping, imageIndex, tableIndex, i);
- if (extractedRow && isValidRow(extractedRow)) {
- dataRows.push(extractedRow);
- }
- } catch (error) {
- console.warn(`Error processing row ${i}:`, error);
- }
- }
+ const parsed = buildRow(row, format, mapping, tblIdx, r);
+ if (!parsed || !isValidRow(parsed)) continue;
+
+ const key = `${parsed.no}-${parsed.identificationNo}`;
+ if (seen.has(key)) continue;
+ seen.add(key);
+
+ data.push(parsed);
}
-
- console.log(`Extracted ${dataRows.length} valid rows from table`);
- return dataRows;
+ return data;
}
-// 테이블 그리드 구축
-function buildTableGrid(table: OCRTable): string[][] {
- if (!table.cells || table.cells.length === 0) {
- return [];
- }
+/* -------------------------------------------------------------------------- */
+/* Grid & Header */
+/* -------------------------------------------------------------------------- */
+
+function buildGrid (table: OCRTable): string[][] {
+ const maxR = Math.max(...table.cells.map(c => c.rowIndex + c.rowSpan - 1));
+ const maxC = Math.max(...table.cells.map(c => c.columnIndex + c.columnSpan - 1));
+ const grid = Array.from({ length: maxR + 1 }, () => Array(maxC + 1).fill(''));
- const maxRow = Math.max(...table.cells.map(cell => cell.rowIndex + cell.rowSpan - 1)) + 1;
- const maxCol = Math.max(...table.cells.map(cell => cell.columnIndex + cell.columnSpan - 1)) + 1;
-
- const grid: string[][] = Array(maxRow).fill(null).map(() => Array(maxCol).fill(''));
-
- // 셀 내용으로 그리드 채우기
table.cells.forEach(cell => {
- const text = getCellText(cell);
-
+ const txt = getCellText(cell);
for (let r = cell.rowIndex; r < cell.rowIndex + cell.rowSpan; r++) {
for (let c = cell.columnIndex; c < cell.columnIndex + cell.columnSpan; c++) {
- if (grid[r] && grid[r][c] !== undefined) {
- // 기존 텍스트가 있으면 결합
- grid[r][c] = grid[r][c] ? `${grid[r][c]} ${text}`.trim() : text;
- }
+ grid[r][c] = grid[r][c] ? `${grid[r][c]} ${txt}` : txt;
}
}
});
-
return grid;
}
-// 셀 텍스트 추출
-function getCellText(cell: TableCell): string {
- if (!cell.cellTextLines || cell.cellTextLines.length === 0) {
- return '';
- }
-
- return cell.cellTextLines
- .map(line =>
- line.cellWords
- .map(word => word.inferText || '')
- .join(' ')
- )
- .join('\n')
- .trim();
+function getCellText (cell: TableCell): string {
+ return cell.cellTextLines?.flatMap(l => l.cellWords.map(w => w.inferText.trim())).filter(Boolean).join(' ') ?? '';
}
-// 헤더 행 찾기
-function findHeaderRow(tableGrid: string[][]): number {
- for (let i = 0; i < Math.min(3, tableGrid.length); i++) {
- const row = tableGrid[i];
- const rowText = row.join(' ').toLowerCase();
-
- console.log(`Checking row ${i}: "${rowText}"`);
-
- const hasNo = rowText.includes('no.') || rowText.includes('번호') || /\bno\b/.test(rowText);
- const hasIdent = rowText.includes('identification') || rowText.includes('식별') || rowText.includes('ident');
-
- if (hasNo && hasIdent) {
- console.log(`Header row found at ${i}`);
- return i;
- }
+function findHeaderRow (grid: string[][]): number {
+ for (let i = 0; i < Math.min(3, grid.length); i++) {
+ const t = grid[i].join(' ').toLowerCase();
+ if (/\bno\b|번호/.test(t) && /identification|식별|ident/.test(t)) return i;
}
return -1;
}
-// 테이블 형식 결정
-function determineTableFormat(headerRow: string[]): 'format1' | 'format2' {
- const headerText = headerRow.join(' ').toLowerCase();
-
- // Format 2: Tag No와 Joint No가 분리된 컬럼
- const hasTagNoColumn = headerText.includes('tag') && headerText.includes('no');
- const hasJointNoColumn = headerText.includes('joint') && headerText.includes('no');
-
- if (hasTagNoColumn && hasJointNoColumn) {
- return 'format2';
- }
-
- // Format 1: Identification No에 통합
- return 'format1';
-}
+/* -------------------------------------------------------------------------- */
+/* Column Mapping */
+/* -------------------------------------------------------------------------- */
-// 컬럼 매핑 찾기
-function findColumnMapping(headerRow: string[], format: 'format1' | 'format2'): ColumnMapping {
- const mapping: ColumnMapping = {
- no: -1,
- identification: -1,
- tagNo: -1,
- jointNo: -1,
- jointType: -1,
- weldingDate: -1
- };
+function detectFormat (header: string[]): 'format1' | 'format2' {
+ const h = header.join(' ').toLowerCase();
+ return h.includes('tag') && h.includes('joint') ? 'format2' : 'format1';
+}
- headerRow.forEach((header, index) => {
- const lowerHeader = header.toLowerCase().trim();
-
- console.log(`Column ${index}: "${header}" -> "${lowerHeader}"`);
-
- if ((lowerHeader.includes('no.') || lowerHeader === 'no') &&
- !lowerHeader.includes('identification') &&
- !lowerHeader.includes('tag') &&
- !lowerHeader.includes('joint')) {
- mapping.no = index;
- console.log(` -> Mapped to 'no'`);
- } else if (lowerHeader.includes('identification') || lowerHeader.includes('ident')) {
- mapping.identification = index;
- console.log(` -> Mapped to 'identification'`);
- } else if (lowerHeader.includes('tag') && lowerHeader.includes('no')) {
- mapping.tagNo = index;
- console.log(` -> Mapped to 'tagNo'`);
- } else if (lowerHeader.includes('joint') && lowerHeader.includes('no')) {
- mapping.jointNo = index;
- console.log(` -> Mapped to 'jointNo'`);
- } else if (lowerHeader.includes('joint') && lowerHeader.includes('type')) {
- mapping.jointType = index;
- console.log(` -> Mapped to 'jointType'`);
- } else if (lowerHeader.includes('type') && !lowerHeader.includes('joint')) {
- mapping.jointType = index;
- console.log(` -> Mapped to 'jointType'`);
- } else if (lowerHeader.includes('welding') || lowerHeader.includes('date')) {
- mapping.weldingDate = index;
- console.log(` -> Mapped to 'weldingDate'`);
- }
+function mapColumns (header: string[]): ColumnMapping {
+ const mp: ColumnMapping = { no: -1, identification: -1, tagNo: -1, jointNo: -1, jointType: -1, weldingDate: -1 };
+
+ header.forEach((h, i) => {
+ const t = h.toLowerCase();
+ if (/^no\.?$/.test(t) && !/ident|tag|joint/.test(t)) mp.no = i;
+ else if (/identification|ident/.test(t)) mp.identification = i;
+ else if (/tag.*no/.test(t)) mp.tagNo = i;
+ else if (/joint.*no/.test(t)) mp.jointNo = i;
+ else if (/joint.*type/.test(t) || (/^type$/.test(t) && mp.jointType === -1)) mp.jointType = i;
+ else if (/welding|date/.test(t)) mp.weldingDate = i;
});
-
- console.log('Final column mapping:', mapping);
- return mapping;
+ return mp;
}
-// 행 데이터 추출
-function extractRowData(
- row: string[],
- format: 'format1' | 'format2',
- columnMapping: ColumnMapping,
- imageIndex: number,
- tableIndex: number,
- rowIndex: number
+/* -------------------------------------------------------------------------- */
+/* Row Extraction */
+/* -------------------------------------------------------------------------- */
+
+function buildRow (
+ row: string[],
+ format: 'format1' | 'format2',
+ mp: ColumnMapping,
+ tblIdx: number,
+ rowIdx: number
): ExtractedRow | null {
-
- const extractedRow: ExtractedRow = {
- no: '',
+ const out: ExtractedRow = {
+ no: mp.no >= 0 ? clean(row[mp.no]) : '',
identificationNo: '',
tagNo: '',
jointNo: '',
- jointType: '',
+ jointType: mp.jointType >= 0 ? clean(row[mp.jointType]) : '',
weldingDate: '',
confidence: 0,
- sourceTable: tableIndex,
- sourceRow: rowIndex
+ sourceTable: tblIdx,
+ sourceRow: rowIdx,
};
- console.log(`Processing row ${rowIndex}: [${row.map(cell => `"${cell}"`).join(', ')}]`);
-
- // No. 추출
- if (columnMapping.no >= 0 && columnMapping.no < row.length) {
- extractedRow.no = cleanText(row[columnMapping.no]);
+ if (mp.weldingDate >= 0) out.weldingDate = clean(row[mp.weldingDate]);
+ else {
+ const idx = row.findIndex(col => /\d{4}[.\-/]\d{1,2}[.\-/]\d{1,2}/.test(col));
+ if (idx >= 0) out.weldingDate = clean(row[idx]);
}
- if (format === 'format1') {
- // Format 1: 통합된 identification 데이터
- if (columnMapping.identification >= 0 && columnMapping.identification < row.length) {
- const combinedText = row[columnMapping.identification];
- const parsedData = parseIdentificationData(combinedText);
- extractedRow.identificationNo = parsedData.identificationNo;
- extractedRow.tagNo = parsedData.tagNo;
- extractedRow.jointNo = parsedData.jointNo;
-
- console.log(` Parsed identification: "${combinedText}" -> `, parsedData);
- }
+ if (format === 'format2') {
+ if (mp.identification >= 0) out.identificationNo = clean(row[mp.identification]);
+ if (mp.jointNo >= 0) out.jointNo = clean(row[mp.jointNo]);
+ if (mp.tagNo >= 0) out.tagNo = clean(row[mp.tagNo]);
} else {
- // Format 2: 분리된 컬럼들
- if (columnMapping.identification >= 0 && columnMapping.identification < row.length) {
- extractedRow.identificationNo = cleanText(row[columnMapping.identification]);
- }
- if (columnMapping.tagNo >= 0 && columnMapping.tagNo < row.length) {
- extractedRow.tagNo = cleanText(row[columnMapping.tagNo]);
- }
- if (columnMapping.jointNo >= 0 && columnMapping.jointNo < row.length) {
- extractedRow.jointNo = cleanText(row[columnMapping.jointNo]);
- }
+ const combined = mp.identification >= 0 ? row[mp.identification] : '';
+ const parsed = parseIdentificationData(combined);
+ out.identificationNo = parsed.identificationNo;
+ out.jointNo = parsed.jointNo;
+ out.tagNo = parsed.tagNo;
}
- // Joint Type 추출
- if (columnMapping.jointType >= 0 && columnMapping.jointType < row.length) {
- extractedRow.jointType = cleanText(row[columnMapping.jointType]);
- }
-
- // Welding Date 추출 (컬럼 매핑이 있으면 사용, 없으면 날짜 패턴으로 찾기)
- if (columnMapping.weldingDate >= 0 && columnMapping.weldingDate < row.length) {
- extractedRow.weldingDate = cleanText(row[columnMapping.weldingDate]);
- } else {
- const dateIndex = findDateColumn(row);
- if (dateIndex >= 0) {
- extractedRow.weldingDate = cleanText(row[dateIndex]);
- }
- }
-
- // 신뢰도 계산
- extractedRow.confidence = calculateRowConfidence(extractedRow);
-
- console.log(` Extracted row:`, extractedRow);
-
- return extractedRow;
+ out.confidence = scoreRow(out);
+ return out;
}
-// Identification 데이터 파싱 (Format 1용)
-function parseIdentificationData(combinedText: string): {
- identificationNo: string;
- tagNo: string;
- jointNo: string;
-} {
- const cleanedText = cleanText(combinedText);
-
- console.log(`Parsing identification data: "${cleanedText}"`);
-
- // 줄바꿈으로 먼저 분리
- const lines = cleanedText.split(/[\r\n]+/).map(line => line.trim()).filter(line => line.length > 0);
-
- const allParts: string[] = [];
- lines.forEach(line => {
- // 공백과 특수문자로 분리
- const parts = line.split(/[\s\-_]+/).filter(part => part.length > 0);
- allParts.push(...parts);
- });
+/* -------------------------------------------------------------------------- */
+/* Format‑1 셀 파싱 */
+/* -------------------------------------------------------------------------- */
- console.log(` Split into parts:`, allParts);
+function parseIdentificationData (txt: string): { identificationNo: string; jointNo: string; tagNo: string } {
+ const cleaned = clean(txt);
+ if (!cleaned) return { identificationNo: '', jointNo: '', tagNo: '' };
- if (allParts.length === 0) {
- return { identificationNo: cleanedText, tagNo: '', jointNo: '' };
- }
+ const tokens = cleaned.split(/\s+/).map(clean).filter(Boolean);
- if (allParts.length === 1) {
- return { identificationNo: allParts[0], tagNo: '', jointNo: '' };
- }
+ // Identification 후보: 하이픈이 2개 이상 포함된 토큰 가운데 가장 긴 것
+ const idCand = tokens.filter(t => t.split('-').length >= 3).sort((a, b) => b.length - a.length);
+ const identificationNo = idCand[0] || '';
- // 길이별로 정렬하여 식별
- const sortedParts = [...allParts].sort((a, b) => b.length - a.length);
-
- const identificationNo = sortedParts[0]; // 가장 긴 것
- const jointNo = allParts.find(part => part.length <= 3 && /^[A-Z0-9]+$/i.test(part)) ||
- sortedParts[sortedParts.length - 1]; // 3글자 이하 영숫자 또는 가장 짧은 것
- const tagNo = allParts.find(part => part !== identificationNo && part !== jointNo) || '';
-
- const result = { identificationNo, tagNo, jointNo };
- console.log(` Parsed result:`, result);
-
- return result;
-}
+ const residual = tokens.filter(t => t !== identificationNo);
+ if (!residual.length) return { identificationNo, jointNo: '', tagNo: '' };
-// 날짜 컬럼 찾기
-function findDateColumn(row: string[]): number {
- const datePattern = /\d{4}[.\-\/]\d{1,2}[.\-\/]\d{1,2}/;
-
- for (let i = 0; i < row.length; i++) {
- if (datePattern.test(row[i])) {
- console.log(` Found date in column ${i}: "${row[i]}"`);
- return i;
- }
- }
-
- return -1;
-}
+ residual.sort((a, b) => a.length - b.length);
+ const jointNo = residual[0] || '';
+ const tagNo = residual[residual.length - 1] || '';
-// 텍스트 정리
-function cleanText(text: string): string {
- return text
- .replace(/[\r\n\t]+/g, ' ')
- .replace(/\s+/g, ' ')
- .trim();
+ return { identificationNo, jointNo, tagNo };
}
-// 빈 행 확인
-function isEmptyRow(row: string[]): boolean {
- return row.every(cell => !cell || cell.trim().length === 0);
+/* -------------------------------------------------------------------------- */
+/* Helpers */
+/* -------------------------------------------------------------------------- */
+
+const clean = (s: string = '') => s.replace(/[\r\n\t]+/g, ' ').replace(/\s+/g, ' ').trim();
+const isBlankRow = (row: string[]) => row.every(c => !clean(c));
+const isValidRow = (r: ExtractedRow) => !!(r.no || r.identificationNo);
+
+function scoreRow (r: ExtractedRow): number {
+ const w: Record<keyof ExtractedRow, number> = {
+ no: 1, identificationNo: 3, tagNo: 2, jointNo: 2, jointType: 1, weldingDate: 1,
+ confidence: 0, sourceTable: 0, sourceRow: 0,
+ } as any;
+ let s = 0, t = 0;
+ (Object.keys(w) as (keyof ExtractedRow)[]).forEach(k => { t += w[k]; if ((r[k] as string)?.length) s += w[k]; });
+ return t ? s / t : 0;
}
-// 유효한 행 확인
-function isValidRow(row: ExtractedRow): boolean {
- // 번호나 식별번호 중 하나라도 있으면 유효
- const hasBasicData = !!(row.no || row.identificationNo);
-
- // 너무 짧은 데이터는 제외 (오인식 방지)
- const hasReasonableLength = (row.identificationNo?.length || 0) >= 3 ||
- (row.no?.length || 0) >= 1;
-
- return hasBasicData && hasReasonableLength;
-}
+/* -------------------------------------------------------------------------- */
+/* OCR 품질 분석 (기존 로직 유지) */
+/* -------------------------------------------------------------------------- */
-// 행 신뢰도 계산
-function calculateRowConfidence(row: ExtractedRow): number {
- let score = 0;
- let maxScore = 0;
-
- // 각 필드별 가중치
- const weights = {
- no: 1,
- identificationNo: 3, // 가장 중요
- tagNo: 2,
- jointNo: 2,
- jointType: 1,
- weldingDate: 1
- };
+export function analyzeOCRQuality (ocrResult: any) {
+ let conf = 0, cnt = 0, tbl = 0, kw = 0;
+ const keys = ['no.', 'identification', 'joint', 'tag', 'type', 'weld', 'date'];
- Object.entries(weights).forEach(([field, weight]) => {
- maxScore += weight;
- const value = row[field as keyof ExtractedRow] as string;
-
- if (value && value.length > 0) {
- // 기본 점수
- score += weight * 0.5;
-
- // 길이 보너스
- if (field === 'identificationNo' && value.length > 10) {
- score += weight * 0.3;
- } else if (field === 'no' && /^\d+$/.test(value)) {
- score += weight * 0.3;
- } else if (field === 'weldingDate' && /\d{4}[.\-\/]\d{1,2}[.\-\/]\d{1,2}/.test(value)) {
- score += weight * 0.3;
- } else if (value.length > 2) {
- score += weight * 0.2;
- }
- }
+ ocrResult.images?.forEach((img: any) => {
+ tbl += img.tables?.length || 0;
+ img.fields?.forEach((f: any) => {
+ conf += f.inferConfidence || 0; cnt++;
+ const t = (f.inferText || '').toLowerCase();
+ keys.forEach(k => { if (t.includes(k)) kw++; });
+ });
});
- return maxScore > 0 ? Math.min(score / maxScore, 1) : 0;
+ return { confidence: cnt ? conf / cnt : 0, tablesFound: tbl, textQuality: cnt ? kw / cnt : 0, keywordCount: kw };
}
-
-// 유틸리티: OCR 결과 품질 분석
-export function analyzeOCRQuality(ocrResult: any): {
- confidence: number;
- tablesFound: number;
- textQuality: number;
- keywordCount: number;
-} {
- let totalConfidence = 0;
- let totalFields = 0;
- let tablesFound = 0;
- let relevantKeywords = 0;
-
- const keywords = ['no.', 'identification', 'joint', 'tag', 'type', 'weld', 'date'];
-
- if (ocrResult.images) {
- ocrResult.images.forEach((image: any) => {
- // 테이블 분석
- if (image.tables) {
- tablesFound += image.tables.length;
- }
-
- // 필드 신뢰도 분석
- if (image.fields) {
- image.fields.forEach((field: any) => {
- const confidence = field.inferConfidence || 0;
- const text = (field.inferText || '').toLowerCase();
-
- totalConfidence += confidence;
- totalFields++;
-
- // 관련 키워드 확인
- keywords.forEach(keyword => {
- if (text.includes(keyword)) {
- relevantKeywords++;
- }
- });
- });
- }
- });
- }
-
- const avgConfidence = totalFields > 0 ? totalConfidence / totalFields : 0;
- const textQuality = totalFields > 0 ? relevantKeywords / totalFields : 0;
-
- return {
- confidence: avgConfidence,
- tablesFound,
- textQuality,
- keywordCount: relevantKeywords
- };
-} \ No newline at end of file