// app/api/ocr/utils/tableExtraction.ts
// Table extraction logic for OCR results: picks the relevant tables out of an
// OCR response, maps their columns by header keywords and converts every data
// row into an ExtractedRow.

/** One data row extracted from a recognised table. */
interface ExtractedRow {
  no: string;
  identificationNo: string;
  tagNo: string;
  jointNo: string;
  jointType: string;
  weldingDate: string;
  /** Heuristic completeness score in [0, 1]; see calculateRowConfidence. */
  confidence: number;
  /** Zero-based index of the table inside its image. */
  sourceTable: number;
  /** Zero-based index of the row inside the table grid. */
  sourceRow: number;
}

/** A single table cell as delivered by the OCR engine. */
interface TableCell {
  cellTextLines: Array<{
    cellWords: Array<{
      inferText: string;
      inferConfidence: number;
    }>;
  }>;
  rowIndex: number;
  columnIndex: number;
  rowSpan: number;
  columnSpan: number;
  inferConfidence: number;
}

/** A table as delivered by the OCR engine. */
interface OCRTable {
  cells: TableCell[];
  inferConfidence: number;
}

/** Column index of each known field in a header row; -1 means "not found". */
interface ColumnMapping {
  no: number;
  identification: number;
  tagNo: number;
  jointNo: number;
  jointType: number;
  weldingDate: number;
}

/**
 * 'format2' has separate "Tag No" and "Joint No" columns; 'format1' carries
 * them combined inside the identification column.
 */
type TableFormat = 'format1' | 'format2';

/** Fields that contribute to the row confidence score. */
type ScoredField =
  | 'no'
  | 'identificationNo'
  | 'tagNo'
  | 'jointNo'
  | 'jointType'
  | 'weldingDate';

/** Number of leading rows that are searched for a header. */
const HEADER_SEARCH_ROWS = 3;

/** Matches dates such as 2024-01-15, 2024.1.5 or 2024/01/15. */
const DATE_PATTERN = /\d{4}[.\-\/]\d{1,2}[.\-\/]\d{1,2}/;

/** Per-field weights for the confidence score (identificationNo matters most). */
const FIELD_WEIGHTS: ReadonlyArray<readonly [ScoredField, number]> = [
  ['no', 1],
  ['identificationNo', 3],
  ['tagNo', 2],
  ['jointNo', 2],
  ['jointType', 1],
  ['weldingDate', 1],
];

/** True when lower-cased header text mentions a "No." / number column. */
function hasNoKeyword(text: string): boolean {
  return text.includes('no.') || text.includes('번호') || /\bno\b/.test(text);
}

/** True when lower-cased header text mentions an identification column. */
function hasIdentificationKeyword(text: string): boolean {
  // 'ident' also covers 'identification'.
  return text.includes('ident') || text.includes('식별');
}

/**
 * Main entry point: extracts the rows of every relevant table in an OCR result.
 *
 * @param ocrResult Raw OCR response; expected to carry `images[].tables[]`.
 * @returns One array of rows per table that yielded at least one valid row.
 *          Returns an empty array when the result has no images. A failure in
 *          one table is logged and does not abort the remaining tables.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- untyped external payload
export async function extractTablesFromOCR(ocrResult: any): Promise<ExtractedRow[][]> {
  const extractedTables: ExtractedRow[][] = [];

  if (!ocrResult || !ocrResult.images) {
    console.warn('No OCR images found in result');
    return [];
  }

  for (let imageIndex = 0; imageIndex < ocrResult.images.length; imageIndex++) {
    const image = ocrResult.images[imageIndex];

    if (!image.tables || image.tables.length === 0) {
      console.warn(`No tables found in image ${imageIndex}`);
      continue;
    }

    for (let tableIndex = 0; tableIndex < image.tables.length; tableIndex++) {
      const table = image.tables[tableIndex];

      try {
        if (isRelevantTable(table)) {
          const extractedRows = extractTableData(table, imageIndex, tableIndex);

          if (extractedRows.length > 0) {
            extractedTables.push(extractedRows);
            console.log(`Successfully extracted ${extractedRows.length} rows from table ${tableIndex + 1} in image ${imageIndex + 1}`);
          } else {
            console.warn(`Table ${tableIndex + 1} in image ${imageIndex + 1} was identified as relevant but no data could be extracted`);
          }
        } else {
          console.log(`Table ${tableIndex + 1} in image ${imageIndex + 1} is not relevant (no required headers found)`);
        }
      } catch (error) {
        console.error(`Error processing table ${tableIndex + 1} in image ${imageIndex + 1}:`, error);
      }
    }
  }

  console.log(`Total extracted tables: ${extractedTables.length}`);
  return extractedTables;
}

/**
 * Decides whether a table is worth extracting.
 *
 * A table is relevant when its first rows mention both a "No." and an
 * identification header, it has at least 6 cells and its confidence is >= 0.5.
 * The keyword rules are the same ones findHeaderRow uses.
 */
function isRelevantTable(table: OCRTable): boolean {
  if (!table.cells || table.cells.length === 0) {
    return false;
  }

  // Look for header keywords in the first rows only.
  const headerTexts = table.cells
    .filter(cell => cell.rowIndex < HEADER_SEARCH_ROWS)
    .map(cell => getCellText(cell).toLowerCase())
    .filter(text => text.length > 0);

  console.log('Header texts found:', headerTexts);

  // Required keywords.
  const hasNo = headerTexts.some(text => hasNoKeyword(text));
  const hasIdentification = headerTexts.some(text => hasIdentificationKeyword(text));

  // Table quality checks.
  const hasMinimumCells = table.cells.length >= 6; // at least a header plus some data
  const hasReasonableConfidence = table.inferConfidence >= 0.5; // deliberately lenient threshold

  const isRelevant = hasNo && hasIdentification && hasMinimumCells && hasReasonableConfidence;

  console.log(`Table relevance check: hasNo=${hasNo}, hasIdentification=${hasIdentification}, minCells=${hasMinimumCells}, confidence=${hasReasonableConfidence} => ${isRelevant}`);

  return isRelevant;
}

/**
 * Extracts all valid data rows of one table.
 *
 * @param table      The OCR table.
 * @param imageIndex Zero-based image index (used for logging and row parsing).
 * @param tableIndex Zero-based table index (stored as `sourceTable`).
 * @returns The valid rows below the header row; empty when the grid has fewer
 *          than two rows or no header row is found.
 */
function extractTableData(table: OCRTable, imageIndex: number, tableIndex: number): ExtractedRow[] {
  console.log(`Processing table ${tableIndex + 1} in image ${imageIndex + 1}`);

  // Build the text grid.
  const tableGrid = buildTableGrid(table);

  if (tableGrid.length < 2) {
    console.warn('Table has less than 2 rows (need header + data)');
    return [];
  }

  console.log(`Table grid built: ${tableGrid.length} rows, ${tableGrid[0]?.length || 0} columns`);

  // Locate the header row.
  const headerRowIndex = findHeaderRow(tableGrid);
  const headerRow = headerRowIndex >= 0 ? tableGrid[headerRowIndex] : undefined;
  if (headerRow === undefined) {
    console.warn('No header row found');
    return [];
  }

  console.log(`Header row found at index: ${headerRowIndex}`);

  // Determine the table format.
  const tableFormat = determineTableFormat(headerRow);
  console.log(`Table format detected: ${tableFormat}`);

  // Map header keywords to column indices.
  const columnMapping = findColumnMapping(headerRow, tableFormat);
  console.log('Column mapping:', columnMapping);

  // Extract the data rows that follow the header.
  const dataRows: ExtractedRow[] = [];

  for (let i = headerRowIndex + 1; i < tableGrid.length; i++) {
    const row = tableGrid[i];

    if (row && row.length > 0 && !isEmptyRow(row)) {
      try {
        const extractedRow = extractRowData(row, tableFormat, columnMapping, imageIndex, tableIndex, i);

        if (extractedRow && isValidRow(extractedRow)) {
          dataRows.push(extractedRow);
        }
      } catch (error) {
        console.warn(`Error processing row ${i}:`, error);
      }
    }
  }

  console.log(`Extracted ${dataRows.length} valid rows from table`);
  return dataRows;
}

/** True for a finite, non-negative integer usable as a grid index. */
function isGridIndex(value: unknown): value is number {
  return typeof value === 'number' && Number.isInteger(value) && value >= 0;
}

/** Returns a usable span: the value itself when it is an integer >= 1, else 1. */
function normalizeSpan(span: unknown): number {
  return typeof span === 'number' && Number.isInteger(span) && span >= 1 ? span : 1;
}

/**
 * Converts the cell list of a table into a rectangular grid of strings.
 *
 * A cell spanning several rows/columns has its text written into every grid
 * position it covers. When several cells land on the same position their
 * texts are joined with a space. Cells without a valid row/column index are
 * skipped and a missing or invalid span is treated as 1, so a partially
 * filled OCR payload cannot produce a NaN-sized grid.
 */
function buildTableGrid(table: OCRTable): string[][] {
  if (!table.cells || table.cells.length === 0) {
    return [];
  }

  const placements = table.cells
    .filter(cell => isGridIndex(cell.rowIndex) && isGridIndex(cell.columnIndex))
    .map(cell => ({
      text: getCellText(cell),
      rowStart: cell.rowIndex,
      rowEnd: cell.rowIndex + normalizeSpan(cell.rowSpan),
      colStart: cell.columnIndex,
      colEnd: cell.columnIndex + normalizeSpan(cell.columnSpan),
    }));

  let rowCount = 0;
  let columnCount = 0;
  for (const placement of placements) {
    rowCount = Math.max(rowCount, placement.rowEnd);
    columnCount = Math.max(columnCount, placement.colEnd);
  }

  if (rowCount === 0 || columnCount === 0) {
    return [];
  }

  const grid: string[][] = Array.from({ length: rowCount }, () =>
    new Array<string>(columnCount).fill('')
  );

  // Fill the grid with the cell contents.
  for (const { text, rowStart, rowEnd, colStart, colEnd } of placements) {
    for (let r = rowStart; r < rowEnd; r++) {
      const gridRow = grid[r];
      if (gridRow === undefined) {
        continue;
      }
      for (let c = colStart; c < colEnd; c++) {
        const existing = gridRow[c];
        if (existing === undefined) {
          continue;
        }
        // Combine with text that is already present at this position.
        gridRow[c] = existing ? `${existing} ${text}`.trim() : text;
      }
    }
  }

  return grid;
}

/**
 * Returns the text of a cell: words joined by spaces, lines joined by '\n',
 * trimmed. A cell without text lines yields an empty string.
 */
function getCellText(cell: TableCell): string {
  if (!cell.cellTextLines || cell.cellTextLines.length === 0) {
    return '';
  }

  return cell.cellTextLines
    .map(line =>
      (line.cellWords ?? [])
        .map(word => word.inferText || '')
        .join(' ')
    )
    .join('\n')
    .trim();
}

/**
 * Finds the header row among the first rows of the grid.
 *
 * @returns Index of the first row mentioning both a "No." and an
 *          identification keyword, or -1 when there is none.
 */
function findHeaderRow(tableGrid: string[][]): number {
  const limit = Math.min(HEADER_SEARCH_ROWS, tableGrid.length);

  for (let i = 0; i < limit; i++) {
    const row = tableGrid[i];
    if (row === undefined) {
      continue;
    }
    const rowText = row.join(' ').toLowerCase();

    console.log(`Checking row ${i}: "${rowText}"`);

    if (hasNoKeyword(rowText) && hasIdentificationKeyword(rowText)) {
      console.log(`Header row found at ${i}`);
      return i;
    }
  }

  return -1;
}

/**
 * Determines the table format from the header row: 'format2' when the header
 * mentions both a tag-number and a joint-number column, otherwise 'format1'.
 */
function determineTableFormat(headerRow: string[]): TableFormat {
  const headerText = headerRow.join(' ').toLowerCase();

  // Format 2: Tag No and Joint No live in separate columns.
  const hasTagNoColumn = headerText.includes('tag') && headerText.includes('no');
  const hasJointNoColumn = headerText.includes('joint') && headerText.includes('no');

  if (hasTagNoColumn && hasJointNoColumn) {
    return 'format2';
  }

  // Format 1: both are merged into the identification column.
  return 'format1';
}

/**
 * Maps each known field to the index of its header column.
 *
 * When several headers match the same field the right-most one wins. Fields
 * without a matching header stay at -1.
 *
 * @param headerRow Header cell texts.
 * @param format    Detected table format (currently not used by the mapping).
 */
function findColumnMapping(headerRow: string[], format: TableFormat): ColumnMapping {
  const mapping: ColumnMapping = {
    no: -1,
    identification: -1,
    tagNo: -1,
    jointNo: -1,
    jointType: -1,
    weldingDate: -1
  };

  headerRow.forEach((header, index) => {
    const lowerHeader = header.toLowerCase().trim();

    console.log(`Column ${index}: "${header}" -> "${lowerHeader}"`);

    const isIdentification = hasIdentificationKeyword(lowerHeader);
    const mentionsTag = lowerHeader.includes('tag');
    const mentionsJoint = lowerHeader.includes('joint');
    // A plain running-number column: mentions "no" but belongs to none of the
    // more specific identification / tag / joint columns.
    const isPlainNumber =
      (lowerHeader.includes('no.') || lowerHeader === 'no' || lowerHeader === '번호') &&
      !isIdentification &&
      !mentionsTag &&
      !mentionsJoint;

    if (isPlainNumber) {
      mapping.no = index;
      console.log(` -> Mapped to 'no'`);
    } else if (isIdentification) {
      mapping.identification = index;
      console.log(` -> Mapped to 'identification'`);
    } else if (mentionsTag && lowerHeader.includes('no')) {
      mapping.tagNo = index;
      console.log(` -> Mapped to 'tagNo'`);
    } else if (mentionsJoint && lowerHeader.includes('no')) {
      mapping.jointNo = index;
      console.log(` -> Mapped to 'jointNo'`);
    } else if (lowerHeader.includes('type')) {
      mapping.jointType = index;
      console.log(` -> Mapped to 'jointType'`);
    } else if (lowerHeader.includes('welding') || lowerHeader.includes('date')) {
      mapping.weldingDate = index;
      console.log(` -> Mapped to 'weldingDate'`);
    }
  });

  console.log('Final column mapping:', mapping);
  return mapping;
}

/** Returns the cell at `index`, or undefined when the index is out of range. */
function readCell(row: string[], index: number): string | undefined {
  return index >= 0 && index < row.length ? row[index] : undefined;
}

/**
 * Converts one grid row into an ExtractedRow.
 *
 * @param row           Cell texts of the row.
 * @param format        Table format; decides how the identification column is read.
 * @param columnMapping Column indices found by findColumnMapping.
 * @param imageIndex    Zero-based image index (not stored in the row).
 * @param tableIndex    Stored as `sourceTable`.
 * @param rowIndex      Stored as `sourceRow`.
 * @returns The extracted row; fields whose column is unmapped stay empty.
 */
function extractRowData(
  row: string[],
  format: TableFormat,
  columnMapping: ColumnMapping,
  imageIndex: number,
  tableIndex: number,
  rowIndex: number
): ExtractedRow | null {
  const extractedRow: ExtractedRow = {
    no: '',
    identificationNo: '',
    tagNo: '',
    jointNo: '',
    jointType: '',
    weldingDate: '',
    confidence: 0,
    sourceTable: tableIndex,
    sourceRow: rowIndex
  };

  console.log(`Processing row ${rowIndex}: [${row.map(cell => `"${cell}"`).join(', ')}]`);

  // No.
  const noCell = readCell(row, columnMapping.no);
  if (noCell !== undefined) {
    extractedRow.no = cleanText(noCell);
  }

  const identificationCell = readCell(row, columnMapping.identification);

  if (format === 'format1') {
    // Format 1: identification, tag and joint number share one cell.
    if (identificationCell !== undefined) {
      const parsedData = parseIdentificationData(identificationCell);

      extractedRow.identificationNo = parsedData.identificationNo;
      extractedRow.tagNo = parsedData.tagNo;
      extractedRow.jointNo = parsedData.jointNo;

      console.log(` Parsed identification: "${identificationCell}" -> `, parsedData);
    }
  } else {
    // Format 2: separate columns.
    if (identificationCell !== undefined) {
      extractedRow.identificationNo = cleanText(identificationCell);
    }

    const tagCell = readCell(row, columnMapping.tagNo);
    if (tagCell !== undefined) {
      extractedRow.tagNo = cleanText(tagCell);
    }

    const jointCell = readCell(row, columnMapping.jointNo);
    if (jointCell !== undefined) {
      extractedRow.jointNo = cleanText(jointCell);
    }
  }

  // Joint type.
  const jointTypeCell = readCell(row, columnMapping.jointType);
  if (jointTypeCell !== undefined) {
    extractedRow.jointType = cleanText(jointTypeCell);
  }

  // Welding date: use the mapped column when there is one, otherwise fall
  // back to the first cell that looks like a date.
  const weldingDateCell =
    readCell(row, columnMapping.weldingDate) ?? readCell(row, findDateColumn(row));
  if (weldingDateCell !== undefined) {
    extractedRow.weldingDate = cleanText(weldingDateCell);
  }

  // Confidence score.
  extractedRow.confidence = calculateRowConfidence(extractedRow);

  console.log(` Extracted row:`, extractedRow);
  return extractedRow;
}

/**
 * Splits a combined identification cell (format 1) into its parts.
 *
 * The text is tokenised on whitespace, '-' and '_'. Heuristics:
 * - identificationNo: the longest token (first one on ties);
 * - jointNo: the first other token of at most 3 alphanumeric characters,
 *   otherwise the shortest token;
 * - tagNo: the first token that differs from both, or ''.
 *
 * With zero tokens the cleaned text itself is returned as identificationNo;
 * with one token that token is.
 */
function parseIdentificationData(combinedText: string): {
  identificationNo: string;
  tagNo: string;
  jointNo: string;
} {
  const cleanedText = cleanText(combinedText);

  console.log(`Parsing identification data: "${cleanedText}"`);

  // cleanText has already turned line breaks into spaces, so one split on
  // whitespace and separator characters is sufficient.
  const allParts = cleanedText.split(/[\s\-_]+/).filter(part => part.length > 0);

  console.log(` Split into parts:`, allParts);

  const onlyPart = allParts[0];
  if (onlyPart === undefined) {
    return { identificationNo: cleanedText, tagNo: '', jointNo: '' };
  }

  if (allParts.length === 1) {
    return { identificationNo: onlyPart, tagNo: '', jointNo: '' };
  }

  // Longest first; Array.prototype.sort is stable, so ties keep their order.
  const sortedParts = [...allParts].sort((a, b) => b.length - a.length);

  const identificationNo = sortedParts[0] ?? onlyPart;
  const shortestPart = sortedParts[sortedParts.length - 1] ?? '';
  // The token already used as identification number must not be reused here.
  const jointNo =
    allParts.find(part =>
      part !== identificationNo && part.length <= 3 && /^[A-Z0-9]+$/i.test(part)
    ) ?? shortestPart;
  const tagNo = allParts.find(part => part !== identificationNo && part !== jointNo) ?? '';

  const result = { identificationNo, tagNo, jointNo };
  console.log(` Parsed result:`, result);

  return result;
}

/** Returns the index of the first cell containing a date, or -1. */
function findDateColumn(row: string[]): number {
  const dateIndex = row.findIndex(cell => DATE_PATTERN.test(cell));

  if (dateIndex >= 0) {
    console.log(` Found date in column ${dateIndex}: "${row[dateIndex]}"`);
  }

  return dateIndex;
}

/** Collapses every run of whitespace (including line breaks) to one space and trims. */
function cleanText(text: string): string {
  return text
    .replace(/[\r\n\t]+/g, ' ')
    .replace(/\s+/g, ' ')
    .trim();
}

/** True when every cell of the row is empty or whitespace-only. */
function isEmptyRow(row: string[]): boolean {
  return row.every(cell => !cell || cell.trim().length === 0);
}

/**
 * A row is valid when it has a non-empty `no`, or an identification number of
 * at least 3 characters (shorter ones are treated as OCR noise).
 */
function isValidRow(row: ExtractedRow): boolean {
  const hasBasicData = !!(row.no || row.identificationNo);
  const hasReasonableLength = row.identificationNo.length >= 3 || row.no.length >= 1;

  return hasBasicData && hasReasonableLength;
}

/**
 * Scores how complete and plausible a row looks.
 *
 * Every non-empty field earns half of its weight. On top of that it earns one
 * bonus: 0.3 x weight for an identification number longer than 10 characters,
 * an all-digit `no`, or a welding date matching the date pattern; otherwise
 * 0.2 x weight when the value is longer than 2 characters.
 *
 * @returns score / total weight, capped at 1.
 */
function calculateRowConfidence(row: ExtractedRow): number {
  let score = 0;
  let maxScore = 0;

  for (const [field, weight] of FIELD_WEIGHTS) {
    maxScore += weight;

    const value = row[field];
    if (!value) {
      continue;
    }

    // Base score for a populated field.
    score += weight * 0.5;

    // Field-specific bonus, falling back to a generic length bonus.
    if (field === 'identificationNo' && value.length > 10) {
      score += weight * 0.3;
    } else if (field === 'no' && /^\d+$/.test(value)) {
      score += weight * 0.3;
    } else if (field === 'weldingDate' && DATE_PATTERN.test(value)) {
      score += weight * 0.3;
    } else if (value.length > 2) {
      score += weight * 0.2;
    }
  }

  return maxScore > 0 ? Math.min(score / maxScore, 1) : 0;
}

/**
 * Utility: summarises the quality of an OCR result.
 *
 * @param ocrResult Raw OCR response; a missing result or a result without an
 *                  `images` array yields all-zero metrics.
 * @returns `confidence` - mean `inferConfidence` over all fields;
 *          `tablesFound` - number of tables over all images;
 *          `keywordCount` - number of keyword hits in the field texts;
 *          `textQuality` - keyword hits divided by the number of fields.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- untyped external payload
export function analyzeOCRQuality(ocrResult: any): {
  confidence: number;
  tablesFound: number;
  textQuality: number;
  keywordCount: number;
} {
  let totalConfidence = 0;
  let totalFields = 0;
  let tablesFound = 0;
  let relevantKeywords = 0;

  const keywords = ['no.', 'identification', 'joint', 'tag', 'type', 'weld', 'date'];

  const images: unknown = ocrResult?.images;

  if (Array.isArray(images)) {
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    images.forEach((image: any) => {
      // Table count.
      if (Array.isArray(image?.tables)) {
        tablesFound += image.tables.length;
      }

      // Field confidence and keyword hits.
      if (Array.isArray(image?.fields)) {
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        image.fields.forEach((field: any) => {
          const confidence = field?.inferConfidence || 0;
          const text = String(field?.inferText || '').toLowerCase();

          totalConfidence += confidence;
          totalFields++;

          for (const keyword of keywords) {
            if (text.includes(keyword)) {
              relevantKeywords++;
            }
          }
        });
      }
    });
  }

  const avgConfidence = totalFields > 0 ? totalConfidence / totalFields : 0;
  const textQuality = totalFields > 0 ? relevantKeywords / totalFields : 0;

  return {
    confidence: avgConfidence,
    tablesFound,
    textQuality,
    keywordCount: relevantKeywords
  };
}