diff options
Diffstat (limited to 'app/api/ocr/utils/tableExtraction.ts')
| -rw-r--r-- | app/api/ocr/utils/tableExtraction.ts | 556 |
1 files changed, 556 insertions, 0 deletions
// app/api/ocr/utils/tableExtraction.ts
// Table extraction logic: turns OCR table results into structured rows.

/** One data row extracted from an OCR table. */
interface ExtractedRow {
  no: string;
  identificationNo: string;
  tagNo: string;
  jointNo: string;
  jointType: string;
  weldingDate: string;
  /** Heuristic completeness score in [0, 1]; see calculateRowConfidence. */
  confidence: number;
  /** Zero-based index of the table within its image. */
  sourceTable: number;
  /** Zero-based row index within the table grid. */
  sourceRow: number;
}

/** A single cell as delivered in the OCR table payload. */
interface TableCell {
  cellTextLines: Array<{
    cellWords: Array<{
      inferText: string;
      inferConfidence: number;
    }>;
  }>;
  rowIndex: number;
  columnIndex: number;
  rowSpan: number;
  columnSpan: number;
  inferConfidence: number;
}

/** A table as delivered in the OCR payload. */
interface OCRTable {
  cells: TableCell[];
  inferConfidence: number;
}

/** Column index for each logical field; -1 means "column not found". */
interface ColumnMapping {
  no: number;
  identification: number;
  tagNo: number;
  jointNo: number;
  jointType: number;
  weldingDate: number;
}

/** Number of leading rows that are searched for the header. */
const HEADER_SCAN_ROWS = 3;

/** Minimum table-level OCR confidence for a table to be considered. */
const MIN_TABLE_CONFIDENCE = 0.5;

/** Minimum number of cells (header + data) for a table to be considered. */
const MIN_TABLE_CELLS = 6;

/** Year-first date such as 2024-01-15, 2024.1.5 or 2024/01/15. */
const DATE_PATTERN = /\d{4}[.\-\/]\d{1,2}[.\-\/]\d{1,2}/;

/** Relative importance of each field when scoring a row. */
const FIELD_WEIGHTS = {
  no: 1,
  identificationNo: 3, // most important field
  tagNo: 2,
  jointNo: 2,
  jointType: 1,
  weldingDate: 1
} as const;

/** True when lower-cased header text mentions a "No." style column. */
function mentionsNoHeader(text: string): boolean {
  return text.includes('no.') || text.includes('번호') || /\bno\b/.test(text);
}

/** True when lower-cased header text mentions an identification column. */
function mentionsIdentificationHeader(text: string): boolean {
  return text.includes('ident') || text.includes('식별');
}

/**
 * Classifies one header cell into the logical column it names.
 *
 * This is the single source of truth used by both determineTableFormat and
 * findColumnMapping so the two can never disagree about a header.
 *
 * @param header Raw header cell text.
 * @returns The matching ColumnMapping key, or null when nothing matches.
 */
function classifyHeader(header: string): keyof ColumnMapping | null {
  const lowerHeader = header.toLowerCase().trim();
  const isIdent = mentionsIdentificationHeader(lowerHeader);
  const hasTag = lowerHeader.includes('tag');
  const hasJoint = lowerHeader.includes('joint');

  const looksLikeNo =
    lowerHeader.includes('no.') || lowerHeader === 'no' || lowerHeader.includes('번호');

  // A plain running-number column: "No." but not "Identification No.",
  // "Tag No." or "Joint No.".
  if (looksLikeNo && !isIdent && !hasTag && !hasJoint) {
    return 'no';
  }
  if (isIdent) {
    return 'identification';
  }
  if (hasTag && lowerHeader.includes('no')) {
    return 'tagNo';
  }
  if (hasJoint && lowerHeader.includes('no')) {
    return 'jointNo';
  }
  if (lowerHeader.includes('type')) {
    return 'jointType';
  }
  if (lowerHeader.includes('welding') || lowerHeader.includes('date')) {
    return 'weldingDate';
  }
  return null;
}

/**
 * Main entry point: extracts every relevant table from an OCR result.
 *
 * @param ocrResult OCR response; expected to carry `images[].tables[]`.
 * @returns One array of rows per table that produced at least one valid row.
 *          Tables that fail to process are logged and skipped.
 */
export async function extractTablesFromOCR(ocrResult: any): Promise<ExtractedRow[][]> {
  const extractedTables: ExtractedRow[][] = [];

  if (!ocrResult || !Array.isArray(ocrResult.images)) {
    console.warn('No OCR images found in result');
    return [];
  }

  for (let imageIndex = 0; imageIndex < ocrResult.images.length; imageIndex++) {
    const image = ocrResult.images[imageIndex];

    if (!image || !Array.isArray(image.tables) || image.tables.length === 0) {
      console.warn(`No tables found in image ${imageIndex}`);
      continue;
    }

    for (let tableIndex = 0; tableIndex < image.tables.length; tableIndex++) {
      const table = image.tables[tableIndex];

      try {
        if (!isRelevantTable(table)) {
          console.log(`Table ${tableIndex + 1} in image ${imageIndex + 1} is not relevant (no required headers found)`);
          continue;
        }

        const extractedRows = extractTableData(table, imageIndex, tableIndex);

        if (extractedRows.length > 0) {
          extractedTables.push(extractedRows);
          console.log(`Successfully extracted ${extractedRows.length} rows from table ${tableIndex + 1} in image ${imageIndex + 1}`);
        } else {
          console.warn(`Table ${tableIndex + 1} in image ${imageIndex + 1} was identified as relevant but no data could be extracted`);
        }
      } catch (error) {
        // Best effort: one broken table must not abort the whole document.
        console.error(`Error processing table ${tableIndex + 1} in image ${imageIndex + 1}:`, error);
      }
    }
  }

  console.log(`Total extracted tables: ${extractedTables.length}`);
  return extractedTables;
}

/**
 * Decides whether a table is worth extracting: its first rows must mention
 * both a "No." and an identification header, and it must meet the minimum
 * size and confidence thresholds.
 */
function isRelevantTable(table: OCRTable): boolean {
  if (!table || !table.cells || table.cells.length === 0) {
    return false;
  }

  // Look for header keywords in the first HEADER_SCAN_ROWS rows.
  const headerTexts = table.cells
    .filter(cell => cell.rowIndex < HEADER_SCAN_ROWS)
    .map(cell => getCellText(cell).toLowerCase())
    .filter(text => text.length > 0);

  console.log('Header texts found:', headerTexts);

  // Same keyword rules as findHeaderRow, so a table that passes here can
  // actually have its header located later.
  const hasNo = headerTexts.some(mentionsNoHeader);
  const hasIdentification = headerTexts.some(mentionsIdentificationHeader);

  // Table quality checks.
  const hasMinimumCells = table.cells.length >= MIN_TABLE_CELLS;
  const hasReasonableConfidence = table.inferConfidence >= MIN_TABLE_CONFIDENCE;

  const isRelevant = hasNo && hasIdentification && hasMinimumCells && hasReasonableConfidence;

  console.log(`Table relevance check: hasNo=${hasNo}, hasIdentification=${hasIdentification}, minCells=${hasMinimumCells}, confidence=${hasReasonableConfidence} => ${isRelevant}`);

  return isRelevant;
}

/**
 * Extracts all valid data rows from one table.
 *
 * @param table      The OCR table.
 * @param imageIndex Zero-based image index (used for logging).
 * @param tableIndex Zero-based table index (stored in each row).
 * @returns Valid rows below the header; empty when no header is found.
 */
function extractTableData(table: OCRTable, imageIndex: number, tableIndex: number): ExtractedRow[] {
  console.log(`Processing table ${tableIndex + 1} in image ${imageIndex + 1}`);

  const tableGrid = buildTableGrid(table);

  if (tableGrid.length < 2) {
    console.warn('Table has less than 2 rows (need header + data)');
    return [];
  }

  console.log(`Table grid built: ${tableGrid.length} rows, ${tableGrid[0]?.length || 0} columns`);

  const headerRowIndex = findHeaderRow(tableGrid);
  if (headerRowIndex === -1) {
    console.warn('No header row found');
    return [];
  }

  console.log(`Header row found at index: ${headerRowIndex}`);

  const headerRow = tableGrid[headerRowIndex];
  const tableFormat = determineTableFormat(headerRow);
  console.log(`Table format detected: ${tableFormat}`);

  const columnMapping = findColumnMapping(headerRow, tableFormat);
  console.log('Column mapping:', columnMapping);

  const dataRows: ExtractedRow[] = [];

  for (let i = headerRowIndex + 1; i < tableGrid.length; i++) {
    const row = tableGrid[i];

    if (!row || row.length === 0 || isEmptyRow(row)) {
      continue;
    }

    try {
      const extractedRow = extractRowData(row, tableFormat, columnMapping, imageIndex, tableIndex, i);
      if (extractedRow && isValidRow(extractedRow)) {
        dataRows.push(extractedRow);
      }
    } catch (error) {
      // Best effort: skip the row, keep the rest of the table.
      console.warn(`Error processing row ${i}:`, error);
    }
  }

  console.log(`Extracted ${dataRows.length} valid rows from table`);
  return dataRows;
}

/**
 * Lays the table's cells out on a dense rows x columns grid of strings.
 *
 * A cell spanning several rows/columns has its text copied into every grid
 * position it covers. When two cells land on the same position their texts
 * are joined with a space. Missing or invalid spans are treated as 1, and
 * cells without a valid non-negative integer position are ignored.
 */
function buildTableGrid(table: OCRTable): string[][] {
  if (!table || !table.cells || table.cells.length === 0) {
    return [];
  }

  const normalizeSpan = (span: number): number =>
    Number.isFinite(span) && span >= 1 ? Math.floor(span) : 1;
  const isValidIndex = (index: number): boolean => Number.isInteger(index) && index >= 0;

  const placements = table.cells
    .filter(cell => cell && isValidIndex(cell.rowIndex) && isValidIndex(cell.columnIndex))
    .map(cell => ({
      text: getCellText(cell),
      rowStart: cell.rowIndex,
      rowEnd: cell.rowIndex + normalizeSpan(cell.rowSpan),
      colStart: cell.columnIndex,
      colEnd: cell.columnIndex + normalizeSpan(cell.columnSpan)
    }));

  if (placements.length === 0) {
    return [];
  }

  let rowCount = 0;
  let colCount = 0;
  for (const placement of placements) {
    rowCount = Math.max(rowCount, placement.rowEnd);
    colCount = Math.max(colCount, placement.colEnd);
  }

  const grid: string[][] = Array.from({ length: rowCount }, () =>
    new Array<string>(colCount).fill('')
  );

  for (const placement of placements) {
    for (let r = placement.rowStart; r < placement.rowEnd; r++) {
      const gridRow = grid[r];
      if (!gridRow) {
        continue;
      }
      for (let c = placement.colStart; c < placement.colEnd; c++) {
        const existing = gridRow[c];
        // Combine with text already placed at this position, if any.
        gridRow[c] = existing ? `${existing} ${placement.text}`.trim() : placement.text;
      }
    }
  }

  return grid;
}

/**
 * Returns the text of a cell: words joined by spaces, lines joined by "\n".
 * Missing lines or words yield an empty string rather than throwing.
 */
function getCellText(cell: TableCell): string {
  const lines = cell?.cellTextLines;
  if (!Array.isArray(lines) || lines.length === 0) {
    return '';
  }

  return lines
    .map(line =>
      (line?.cellWords ?? [])
        .map(word => word?.inferText || '')
        .join(' ')
    )
    .join('\n')
    .trim();
}

/**
 * Finds the header row among the first HEADER_SCAN_ROWS rows of the grid.
 *
 * @returns Index of the first row mentioning both "No." and identification
 *          keywords, or -1 when there is none.
 */
function findHeaderRow(tableGrid: string[][]): number {
  const limit = Math.min(HEADER_SCAN_ROWS, tableGrid.length);

  for (let i = 0; i < limit; i++) {
    const rowText = tableGrid[i].join(' ').toLowerCase();

    console.log(`Checking row ${i}: "${rowText}"`);

    if (mentionsNoHeader(rowText) && mentionsIdentificationHeader(rowText)) {
      console.log(`Header row found at ${i}`);
      return i;
    }
  }
  return -1;
}

/**
 * Determines the table layout from its header row.
 *
 * - 'format2': the header has a dedicated Tag No column AND a dedicated
 *   Joint No column.
 * - 'format1': otherwise; tag/joint data is expected to be combined inside
 *   the identification cell.
 *
 * Headers are classified cell by cell (not on the joined row text), so an
 * unrelated "No." column next to "Joint Type" is not mistaken for "Joint No".
 */
function determineTableFormat(headerRow: string[]): 'format1' | 'format2' {
  const columnKinds = headerRow.map(classifyHeader);

  const hasTagNoColumn = columnKinds.includes('tagNo');
  const hasJointNoColumn = columnKinds.includes('jointNo');

  return hasTagNoColumn && hasJointNoColumn ? 'format2' : 'format1';
}

/**
 * Maps each logical field to the index of its header column.
 *
 * @param headerRow Header cell texts.
 * @param format    Detected table format (currently informational only).
 * @returns Column indices; -1 for fields with no matching header. When
 *          several headers match the same field, the right-most one wins.
 */
function findColumnMapping(headerRow: string[], format: 'format1' | 'format2'): ColumnMapping {
  const mapping: ColumnMapping = {
    no: -1,
    identification: -1,
    tagNo: -1,
    jointNo: -1,
    jointType: -1,
    weldingDate: -1
  };

  headerRow.forEach((header, index) => {
    console.log(`Column ${index}: "${header}" -> "${header.toLowerCase().trim()}"`);

    const field = classifyHeader(header);
    if (field !== null) {
      mapping[field] = index;
      console.log(`  -> Mapped to '${field}'`);
    }
  });

  console.log('Final column mapping:', mapping);
  return mapping;
}

/**
 * Builds an ExtractedRow from one grid row.
 *
 * @param row           Cell texts of the data row.
 * @param format        Table format from determineTableFormat.
 * @param columnMapping Column indices from findColumnMapping.
 * @param imageIndex    Zero-based image index (not stored in the row).
 * @param tableIndex    Zero-based table index, stored as `sourceTable`.
 * @param rowIndex      Zero-based grid row index, stored as `sourceRow`.
 * @returns The extracted row (validity is checked separately by isValidRow).
 */
function extractRowData(
  row: string[],
  format: 'format1' | 'format2',
  columnMapping: ColumnMapping,
  imageIndex: number,
  tableIndex: number,
  rowIndex: number
): ExtractedRow | null {
  const hasColumn = (index: number): boolean => index >= 0 && index < row.length;
  const cleanedCell = (index: number): string => (hasColumn(index) ? cleanText(row[index]) : '');

  const extractedRow: ExtractedRow = {
    no: '',
    identificationNo: '',
    tagNo: '',
    jointNo: '',
    jointType: '',
    weldingDate: '',
    confidence: 0,
    sourceTable: tableIndex,
    sourceRow: rowIndex
  };

  console.log(`Processing row ${rowIndex}: [${row.map(cell => `"${cell}"`).join(', ')}]`);

  extractedRow.no = cleanedCell(columnMapping.no);

  if (format === 'format1') {
    // Format 1: identification, tag and joint numbers share one cell.
    if (hasColumn(columnMapping.identification)) {
      const combinedText = row[columnMapping.identification];
      const parsedData = parseIdentificationData(combinedText);
      extractedRow.identificationNo = parsedData.identificationNo;
      extractedRow.tagNo = parsedData.tagNo;
      extractedRow.jointNo = parsedData.jointNo;

      console.log(`  Parsed identification: "${combinedText}" -> `, parsedData);
    }
  } else {
    // Format 2: each value has its own column.
    extractedRow.identificationNo = cleanedCell(columnMapping.identification);
    extractedRow.tagNo = cleanedCell(columnMapping.tagNo);
    extractedRow.jointNo = cleanedCell(columnMapping.jointNo);
  }

  extractedRow.jointType = cleanedCell(columnMapping.jointType);

  // Welding date: use the mapped column when there is one, otherwise fall
  // back to the first cell that looks like a date.
  if (hasColumn(columnMapping.weldingDate)) {
    extractedRow.weldingDate = cleanedCell(columnMapping.weldingDate);
  } else {
    extractedRow.weldingDate = cleanedCell(findDateColumn(row));
  }

  extractedRow.confidence = calculateRowConfidence(extractedRow);

  console.log(`  Extracted row:`, extractedRow);

  return extractedRow;
}

/**
 * Splits a combined identification cell (Format 1) into its three values.
 *
 * The text is tokenised on whitespace, '-' and '_'. Heuristics:
 * - identificationNo: the longest token (first one on ties);
 * - jointNo: among the remaining tokens, the first alphanumeric token of at
 *   most 3 characters, otherwise the shortest remaining token;
 * - tagNo: the first token that is neither of the above, or ''.
 *
 * Tokens are tracked by position, so the same token is never reported as
 * both identificationNo and jointNo.
 */
function parseIdentificationData(combinedText: string): {
  identificationNo: string;
  tagNo: string;
  jointNo: string;
} {
  const cleanedText = cleanText(combinedText);

  console.log(`Parsing identification data: "${cleanedText}"`);

  // cleanText already folded line breaks into spaces, so one split suffices.
  const allParts = cleanedText.split(/[\s\-_]+/).filter(part => part.length > 0);

  console.log(`  Split into parts:`, allParts);

  if (allParts.length === 0) {
    return { identificationNo: cleanedText, tagNo: '', jointNo: '' };
  }

  if (allParts.length === 1) {
    return { identificationNo: allParts[0], tagNo: '', jointNo: '' };
  }

  // Longest token (earliest on ties) is the identification number.
  let idIndex = 0;
  allParts.forEach((part, index) => {
    if (part.length > allParts[idIndex].length) {
      idIndex = index;
    }
  });

  const remaining = allParts.filter((_, index) => index !== idIndex);

  // Prefer a short alphanumeric token; otherwise the shortest one left
  // (latest on ties).
  let jointIndex = remaining.findIndex(part => part.length <= 3 && /^[A-Z0-9]+$/i.test(part));
  if (jointIndex === -1) {
    jointIndex = 0;
    remaining.forEach((part, index) => {
      if (part.length <= remaining[jointIndex].length) {
        jointIndex = index;
      }
    });
  }

  const identificationNo = allParts[idIndex];
  const jointNo = remaining[jointIndex];
  const tagNo = remaining.find((_, index) => index !== jointIndex) ?? '';

  const result = { identificationNo, tagNo, jointNo };
  console.log(`  Parsed result:`, result);

  return result;
}

/**
 * Returns the index of the first cell containing a year-first date
 * (see DATE_PATTERN), or -1 when no cell does.
 */
function findDateColumn(row: string[]): number {
  for (let i = 0; i < row.length; i++) {
    if (DATE_PATTERN.test(row[i])) {
      console.log(`  Found date in column ${i}: "${row[i]}"`);
      return i;
    }
  }

  return -1;
}

/** Collapses all whitespace runs (including line breaks) to single spaces and trims. */
function cleanText(text: string): string {
  return (text ?? '')
    .replace(/[\r\n\t]+/g, ' ')
    .replace(/\s+/g, ' ')
    .trim();
}

/** True when every cell of the row is empty or whitespace only. */
function isEmptyRow(row: string[]): boolean {
  return row.every(cell => !cell || cell.trim().length === 0);
}

/**
 * A row is valid when it has a running number, or an identification number
 * of at least 3 characters (shorter values are treated as OCR noise).
 */
function isValidRow(row: ExtractedRow): boolean {
  const hasBasicData = !!(row.no || row.identificationNo);

  const hasReasonableLength =
    (row.identificationNo?.length || 0) >= 3 || (row.no?.length || 0) >= 1;

  return hasBasicData && hasReasonableLength;
}

/**
 * Scores how complete and plausible a row looks, in [0, 1].
 *
 * Every non-empty field earns half of its weight; a format bonus (30 %) is
 * added for a long identification number, an all-digit running number or a
 * well-formed date, otherwise a smaller bonus (20 %) for values longer than
 * two characters.
 */
function calculateRowConfidence(row: ExtractedRow): number {
  let score = 0;
  let maxScore = 0;

  for (const [field, weight] of Object.entries(FIELD_WEIGHTS)) {
    maxScore += weight;
    const value = row[field as keyof typeof FIELD_WEIGHTS];

    if (!value || value.length === 0) {
      continue;
    }

    // Base score for simply being present.
    score += weight * 0.5;

    // Plausibility bonus.
    if (field === 'identificationNo' && value.length > 10) {
      score += weight * 0.3;
    } else if (field === 'no' && /^\d+$/.test(value)) {
      score += weight * 0.3;
    } else if (field === 'weldingDate' && DATE_PATTERN.test(value)) {
      score += weight * 0.3;
    } else if (value.length > 2) {
      score += weight * 0.2;
    }
  }

  return maxScore > 0 ? Math.min(score / maxScore, 1) : 0;
}

/**
 * Utility: summarises the quality of an OCR result.
 *
 * @param ocrResult OCR response; `images[].tables` and `images[].fields`
 *                  are read when present. A missing result yields zeros.
 * @returns `confidence`   – mean `inferConfidence` over all fields;
 *          `tablesFound`  – total number of tables;
 *          `keywordCount` – number of (field, keyword) matches;
 *          `textQuality`  – keywordCount divided by the number of fields
 *                           (may exceed 1 when a field matches several
 *                           keywords).
 */
export function analyzeOCRQuality(ocrResult: any): {
  confidence: number;
  tablesFound: number;
  textQuality: number;
  keywordCount: number;
} {
  let totalConfidence = 0;
  let totalFields = 0;
  let tablesFound = 0;
  let relevantKeywords = 0;

  const keywords = ['no.', 'identification', 'joint', 'tag', 'type', 'weld', 'date'];

  const images: any[] = Array.isArray(ocrResult?.images) ? ocrResult.images : [];

  for (const image of images) {
    if (!image) {
      continue;
    }

    // Table count.
    if (Array.isArray(image.tables)) {
      tablesFound += image.tables.length;
    }

    // Field confidence and keyword hits.
    if (Array.isArray(image.fields)) {
      for (const field of image.fields) {
        const confidence = field?.inferConfidence || 0;
        const text = String(field?.inferText || '').toLowerCase();

        totalConfidence += confidence;
        totalFields++;

        relevantKeywords += keywords.filter(keyword => text.includes(keyword)).length;
      }
    }
  }

  const avgConfidence = totalFields > 0 ? totalConfidence / totalFields : 0;
  const textQuality = totalFields > 0 ? relevantKeywords / totalFields : 0;

  return {
    confidence: avgConfidence,
    tablesFound,
    textQuality,
    keywordCount: relevantKeywords
  };
}
\ No newline at end of file |
