// app/api/ocr/utils/tableExtraction.ts
// Table extraction from OCR output – hardened Format‑1 identification parsing and duplicate-row removal.

/* -------------------------------------------------------------------------- */
/*                                   Types                                    */
/* -------------------------------------------------------------------------- */

/** One data row extracted from an OCR table. All text fields are whitespace-normalised. */
export interface ExtractedRow {
  no: string;
  identificationNo: string;
  tagNo: string;
  jointNo: string;
  jointType: string;
  weldingDate: string;
  /** Weighted share (0..1) of the text fields above that are non-empty; see `scoreRow`. */
  confidence: number;
  /** Index of the source table inside its image's `tables` array. */
  sourceTable: number;
  /** Row index inside the reconstructed grid (the header row is included in the count). */
  sourceRow: number;
}

/** Summary returned by `analyzeOCRQuality`. */
export interface OCRQualityReport {
  confidence: number;
  tablesFound: number;
  textQuality: number;
  keywordCount: number;
}

interface TableCell {
  cellTextLines: Array<{
    cellWords: Array<{
      inferText: string;
      inferConfidence: number;
    }>;
  }>;
  rowIndex: number;
  columnIndex: number;
  rowSpan: number;
  columnSpan: number;
  inferConfidence: number;
}

interface OCRTable {
  cells: TableCell[];
  inferConfidence: number;
}

/** Column index of each logical field in the header row; -1 means "not found". */
interface ColumnMapping {
  no: number;
  identification: number;
  tagNo: number;
  jointNo: number;
  jointType: number;
  weldingDate: number;
}

/** Loose view of the parts of the raw OCR payload this module reads. */
interface OCRField {
  inferText?: unknown;
  inferConfidence?: unknown;
}

interface OCRImage {
  tables?: OCRTable[] | null;
  fields?: OCRField[] | null;
}

type TableFormat = 'format1' | 'format2';

type ScoredField = 'no' | 'identificationNo' | 'tagNo' | 'jointNo' | 'jointType' | 'weldingDate';

/* -------------------------------------------------------------------------- */
/*                                 Constants                                  */
/* -------------------------------------------------------------------------- */

/** Only the first few rows of a table are inspected when looking for the header. */
const HEADER_SCAN_ROWS = 3;

const NO_HEADER_RE = /\bno\b|번호/;
const IDENT_HEADER_RE = /identification|식별|ident/;
/** Pre-filter is intentionally looser than `IDENT_HEADER_RE` (also accepts a bare "id"). */
const RELEVANT_IDENT_RE = /identification|식별|ident|id/;
/** yyyy.mm.dd / yyyy-mm-dd / yyyy/mm/dd, used when no date column was mapped. */
const DATE_RE = /\d{4}[.\-/]\d{1,2}[.\-/]\d{1,2}/;

const QUALITY_KEYWORDS: readonly string[] = ['no.', 'identification', 'joint', 'tag', 'type', 'weld', 'date'];

/** Relative importance of each text field when scoring a row. */
const FIELD_WEIGHTS: Readonly<Record<ScoredField, number>> = {
  no: 1,
  identificationNo: 3,
  tagNo: 2,
  jointNo: 2,
  jointType: 1,
  weldingDate: 1,
};

/* -------------------------------------------------------------------------- */
/*                                Entry point                                 */
/* -------------------------------------------------------------------------- */

/**
 * Extracts the data rows of every relevant table in an OCR result.
 *
 * @param ocrResult Raw OCR payload. Only `images[].tables[]` is read; anything
 *   missing or malformed is skipped instead of throwing.
 * @returns One array of rows per table that produced at least one valid row.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- untyped external payload, narrowed below
export async function extractTablesFromOCR(ocrResult: any): Promise<ExtractedRow[][]> {
  const tables: ExtractedRow[][] = [];
  const images: Array<OCRImage | null | undefined> = Array.isArray(ocrResult?.images) ? ocrResult.images : [];

  images.forEach((image, imgIdx) => {
    const imageTables = image?.tables;
    if (!Array.isArray(imageTables)) return;

    imageTables.forEach((table, tblIdx) => {
      if (!Array.isArray(table?.cells) || !isRelevantTable(table)) return;
      const rows = extractTableData(table, imgIdx, tblIdx);
      if (rows.length) tables.push(rows);
    });
  });

  return tables;
}

/* -------------------------------------------------------------------------- */
/*                          Relevant-table detection                          */
/* -------------------------------------------------------------------------- */

/**
 * Cheap pre-filter: the text of all cells in the first rows must mention both
 * a "No" column and some identification column.
 */
function isRelevantTable(table: OCRTable): boolean {
  const headers = (table?.cells ?? [])
    .filter(c => c != null && c.rowIndex < HEADER_SCAN_ROWS)
    .map(c => getCellText(c))
    .join(' ')
    .toLowerCase();
  return NO_HEADER_RE.test(headers) && RELEVANT_IDENT_RE.test(headers);
}

/* -------------------------------------------------------------------------- */
/*                            Table interpretation                            */
/* -------------------------------------------------------------------------- */

/**
 * Turns one OCR table into validated, de-duplicated rows.
 *
 * @param table  Table to interpret.
 * @param imgIdx Index of the image the table came from (currently not recorded in the output).
 * @param tblIdx Index of the table inside its image; stored as `sourceTable`.
 */
function extractTableData(table: OCRTable, imgIdx: number, tblIdx: number): ExtractedRow[] {
  const grid = buildGrid(table);
  const headerRowIdx = findHeaderRow(grid);
  const header = headerRowIdx >= 0 ? grid[headerRowIdx] : undefined;
  if (!header) return [];

  const format = detectFormat(header);
  const mapping = mapColumns(header);
  const seen = new Set<string>();
  const data: ExtractedRow[] = [];

  for (let r = headerRowIdx + 1; r < grid.length; r++) {
    const row = grid[r];
    if (!row || isBlankRow(row)) continue;
    // Header cells with rowSpan > 1 are replicated into the next grid row(s); never treat them as data.
    if (isHeaderRepeat(row, header)) continue;

    const parsed = buildRow(row, format, mapping, tblIdx, r);
    if (!parsed || !isValidRow(parsed)) continue;

    // NUL separator: identification numbers contain '-', so a '-' joined key would be ambiguous.
    const key = `${parsed.no}\u0000${parsed.identificationNo}`;
    if (seen.has(key)) continue;
    seen.add(key);
    data.push(parsed);
  }
  return data;
}

/** True when every cell of `row` equals the corresponding header cell (case-insensitive). */
function isHeaderRepeat(row: string[], header: string[]): boolean {
  return (
    row.length === header.length &&
    row.every((cell, i) => clean(cell).toLowerCase() === clean(header[i]).toLowerCase())
  );
}

/* -------------------------------------------------------------------------- */
/*                               Grid & header                                */
/* -------------------------------------------------------------------------- */

/** A span that is missing, zero, negative or non-integer is treated as 1. */
function normalizeSpan(span: number): number {
  return Number.isInteger(span) && span > 0 ? span : 1;
}

/**
 * Expands the sparse cell list into a dense rows × columns grid of strings.
 * A cell's text is copied into every position covered by its row/column span;
 * texts landing on the same position are joined with a single space.
 * Cells without a valid non-negative integer position are ignored.
 */
function buildGrid(table: OCRTable): string[][] {
  const cells = (table?.cells ?? []).filter(
    c =>
      c != null &&
      Number.isInteger(c.rowIndex) &&
      c.rowIndex >= 0 &&
      Number.isInteger(c.columnIndex) &&
      c.columnIndex >= 0,
  );

  let rowCount = 0;
  let colCount = 0;
  for (const cell of cells) {
    rowCount = Math.max(rowCount, cell.rowIndex + normalizeSpan(cell.rowSpan));
    colCount = Math.max(colCount, cell.columnIndex + normalizeSpan(cell.columnSpan));
  }

  const grid: string[][] = Array.from({ length: rowCount }, () => new Array<string>(colCount).fill(''));

  for (const cell of cells) {
    const txt = getCellText(cell);
    if (!txt) continue; // nothing to write; also avoids trailing spaces on overlaps
    const rowEnd = cell.rowIndex + normalizeSpan(cell.rowSpan);
    const colEnd = cell.columnIndex + normalizeSpan(cell.columnSpan);
    for (let r = cell.rowIndex; r < rowEnd; r++) {
      const gridRow = grid[r];
      if (!gridRow) continue;
      for (let c = cell.columnIndex; c < colEnd; c++) {
        gridRow[c] = gridRow[c] ? `${gridRow[c]} ${txt}` : txt;
      }
    }
  }
  return grid;
}

/** Joins all recognised words of a cell with single spaces; tolerates missing lines/words. */
function getCellText(cell: TableCell): string {
  const words: string[] = [];
  for (const line of cell?.cellTextLines ?? []) {
    for (const word of line?.cellWords ?? []) {
      const text = typeof word?.inferText === 'string' ? word.inferText.trim() : '';
      if (text) words.push(text);
    }
  }
  return words.join(' ');
}

/**
 * Returns the index of the first of the leading rows that mentions both a
 * "No" column and an identification column, or -1 when there is none.
 */
function findHeaderRow(grid: string[][]): number {
  const limit = Math.min(HEADER_SCAN_ROWS, grid.length);
  for (let i = 0; i < limit; i++) {
    const text = (grid[i] ?? []).join(' ').toLowerCase();
    if (NO_HEADER_RE.test(text) && IDENT_HEADER_RE.test(text)) return i;
  }
  return -1;
}

/* -------------------------------------------------------------------------- */
/*                               Column mapping                               */
/* -------------------------------------------------------------------------- */

/**
 * 'format2' when the header text mentions both "tag" and "joint", otherwise
 * 'format1' (identification, joint and tag share one cell and must be parsed).
 *
 * NOTE(review): the check runs on the joined header text, so a single combined
 * header cell naming both words is also classified as 'format2' — confirm
 * against real documents.
 */
function detectFormat(header: string[]): TableFormat {
  const h = header.join(' ').toLowerCase();
  return h.includes('tag') && h.includes('joint') ? 'format2' : 'format1';
}

/**
 * Maps header cells to logical fields. Each header cell is tested against the
 * patterns in a fixed order and assigned to the first one it matches; when
 * several cells match the same field the last one wins (except a bare "type",
 * which never overrides an already mapped joint type).
 */
function mapColumns(header: string[]): ColumnMapping {
  const mp: ColumnMapping = { no: -1, identification: -1, tagNo: -1, jointNo: -1, jointType: -1, weldingDate: -1 };

  header.forEach((h, i) => {
    // Normalise first: the "No" pattern is anchored and would miss padded text.
    const t = clean(h).toLowerCase();
    if (/^(?:no\.?|번호)$/.test(t)) mp.no = i;
    // Korean alternatives mirror what findHeaderRow accepts as a header.
    else if (/identification|ident|식별/.test(t)) mp.identification = i;
    else if (/tag.*no/.test(t)) mp.tagNo = i;
    else if (/joint.*no/.test(t)) mp.jointNo = i;
    else if (/joint.*type/.test(t) || (/^type$/.test(t) && mp.jointType === -1)) mp.jointType = i;
    else if (/welding|date/.test(t)) mp.weldingDate = i;
  });
  return mp;
}

/* -------------------------------------------------------------------------- */
/*                               Row extraction                               */
/* -------------------------------------------------------------------------- */

/**
 * Builds an `ExtractedRow` from one grid row.
 *
 * - The welding date comes from the mapped column, or else from the first cell
 *   that contains a yyyy-mm-dd style date.
 * - format2: identification / joint / tag are read from their own columns.
 * - format1: all three are parsed out of the identification cell.
 */
function buildRow(
  row: string[],
  format: TableFormat,
  mp: ColumnMapping,
  tblIdx: number,
  rowIdx: number,
): ExtractedRow | null {
  const out: ExtractedRow = {
    no: mp.no >= 0 ? clean(row[mp.no]) : '',
    identificationNo: '',
    tagNo: '',
    jointNo: '',
    jointType: mp.jointType >= 0 ? clean(row[mp.jointType]) : '',
    weldingDate: '',
    confidence: 0,
    sourceTable: tblIdx,
    sourceRow: rowIdx,
  };

  if (mp.weldingDate >= 0) {
    out.weldingDate = clean(row[mp.weldingDate]);
  } else {
    const idx = row.findIndex(col => DATE_RE.test(col));
    if (idx >= 0) out.weldingDate = clean(row[idx]);
  }

  if (format === 'format2') {
    if (mp.identification >= 0) out.identificationNo = clean(row[mp.identification]);
    if (mp.jointNo >= 0) out.jointNo = clean(row[mp.jointNo]);
    if (mp.tagNo >= 0) out.tagNo = clean(row[mp.tagNo]);
  } else {
    const combined = mp.identification >= 0 ? row[mp.identification] : '';
    const parsed = parseIdentificationData(combined ?? '');
    out.identificationNo = parsed.identificationNo;
    out.jointNo = parsed.jointNo;
    out.tagNo = parsed.tagNo;
  }

  out.confidence = scoreRow(out);
  return out;
}

/* -------------------------------------------------------------------------- */
/*                           Format‑1 cell parsing                            */
/* -------------------------------------------------------------------------- */

/**
 * Splits a combined Format‑1 cell into its parts.
 *
 * - identificationNo: the longest whitespace-separated token with at least two hyphens.
 * - jointNo / tagNo: the shortest / longest of the remaining tokens. With a
 *   single remaining token both fields receive that same token.
 */
function parseIdentificationData(txt: string): { identificationNo: string; jointNo: string; tagNo: string } {
  const tokens = clean(txt).split(/\s+/).filter(Boolean);
  if (!tokens.length) return { identificationNo: '', jointNo: '', tagNo: '' };

  const identificationNo =
    tokens.filter(t => t.split('-').length >= 3).sort((a, b) => b.length - a.length)[0] ?? '';

  // Stable sort: tokens of equal length keep their reading order.
  const residual = tokens.filter(t => t !== identificationNo).sort((a, b) => a.length - b.length);

  return {
    identificationNo,
    jointNo: residual[0] ?? '',
    tagNo: residual[residual.length - 1] ?? '',
  };
}

/* -------------------------------------------------------------------------- */
/*                                  Helpers                                   */
/* -------------------------------------------------------------------------- */

/** Collapses every whitespace run (including CR/LF/TAB) to one space and trims. */
function clean(s: string | null | undefined = ''): string {
  return (s ?? '').replace(/\s+/g, ' ').trim();
}

/** True when every cell of the row is empty after cleaning. */
function isBlankRow(row: string[]): boolean {
  return row.every(c => !clean(c));
}

/** A row is kept only if it has a "No" or an identification number. */
function isValidRow(r: ExtractedRow): boolean {
  return !!(r.no || r.identificationNo);
}

/** Weighted share (0..1) of the text fields of `r` that are non-empty. */
function scoreRow(r: ExtractedRow): number {
  let earned = 0;
  let total = 0;
  for (const field of Object.keys(FIELD_WEIGHTS) as ScoredField[]) {
    const weight = FIELD_WEIGHTS[field];
    total += weight;
    if (r[field]) earned += weight;
  }
  return total ? earned / total : 0;
}

/* -------------------------------------------------------------------------- */
/*                            OCR quality analysis                            */
/* -------------------------------------------------------------------------- */

/**
 * Computes coarse quality indicators for an OCR result.
 *
 * @param ocrResult Raw OCR payload; `images[].tables` and `images[].fields` are read.
 *   A nullish or malformed payload yields an all-zero report.
 * @returns `confidence` – mean `inferConfidence` over all fields (non-numeric values count as 0);
 *   `tablesFound` – total number of tables;
 *   `keywordCount` – number of (field, keyword) hits;
 *   `textQuality` – `keywordCount` divided by the number of fields (may exceed 1).
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- untyped external payload, narrowed below
export function analyzeOCRQuality(ocrResult: any): OCRQualityReport {
  const images: Array<OCRImage | null | undefined> = Array.isArray(ocrResult?.images) ? ocrResult.images : [];

  let confidenceSum = 0;
  let fieldCount = 0;
  let tablesFound = 0;
  let keywordCount = 0;

  for (const image of images) {
    const imageTables = image?.tables;
    if (Array.isArray(imageTables)) tablesFound += imageTables.length;

    const fields = image?.fields;
    if (!Array.isArray(fields)) continue;

    for (const field of fields) {
      const fieldConfidence = field?.inferConfidence;
      if (typeof fieldConfidence === 'number' && Number.isFinite(fieldConfidence)) {
        confidenceSum += fieldConfidence;
      }
      fieldCount++;

      const rawText = field?.inferText;
      const text = typeof rawText === 'string' ? rawText.toLowerCase() : '';
      for (const keyword of QUALITY_KEYWORDS) {
        if (text.includes(keyword)) keywordCount++;
      }
    }
  }

  return {
    confidence: fieldCount ? confidenceSum / fieldCount : 0,
    tablesFound,
    textQuality: fieldCount ? keywordCount / fieldCount : 0,
    keywordCount,
  };
}