summaryrefslogtreecommitdiff
path: root/app/api/ocr/utils/tableExtraction.ts
diff options
context:
space:
mode:
Diffstat (limited to 'app/api/ocr/utils/tableExtraction.ts')
-rw-r--r--app/api/ocr/utils/tableExtraction.ts556
1 files changed, 556 insertions, 0 deletions
diff --git a/app/api/ocr/utils/tableExtraction.ts b/app/api/ocr/utils/tableExtraction.ts
new file mode 100644
index 00000000..ea543f8e
--- /dev/null
+++ b/app/api/ocr/utils/tableExtraction.ts
@@ -0,0 +1,556 @@
+// app/api/ocr/utils/tableExtraction.ts
+// Complete implementation of the table extraction logic
+
+interface ExtractedRow { // One data row produced by extractRowData from a table grid row.
+ no: string; // cleaned text of the column mapped as "no"; '' when that column is unmapped
+ identificationNo: string; // '' when no identification column was mapped
+ tagNo: string; // format2: own column; format1: derived by parseIdentificationData
+ jointNo: string; // format2: own column; format1: derived by parseIdentificationData
+ jointType: string; // cleaned text of the column mapped as joint type; '' when unmapped
+ weldingDate: string; // cleaned cell text; never parsed into a Date
+ confidence: number; // heuristic completeness score from calculateRowConfidence
+ sourceTable: number; // index of the table within its image, as passed by the caller
+ sourceRow: number; // index of the row within the table grid
+}
+
+interface TableCell { // Shape of one OCR table cell as consumed by getCellText and buildTableGrid.
+ cellTextLines: Array<{ // text lines inside the cell; getCellText joins them with '\n'
+ cellWords: Array<{ // words of one line; getCellText joins them with a single space
+ inferText: string; // recognized text of the word
+ inferConfidence: number; // not read in this file; presumably the per-word OCR confidence
+ }>;
+ }>;
+ rowIndex: number; // used directly as the starting grid row by buildTableGrid
+ columnIndex: number; // used directly as the starting grid column by buildTableGrid
+ rowSpan: number; // number of grid rows the cell covers; assumed >= 1 - TODO confirm
+ columnSpan: number; // number of grid columns the cell covers; assumed >= 1 - TODO confirm
+ inferConfidence: number; // not read in this file; presumably the per-cell OCR confidence
+}
+
+interface OCRTable { // One table detected in an OCR image.
+ cells: TableCell[]; // flat list of cells; positions come from each cell's rowIndex/columnIndex
+ inferConfidence: number; // table-level confidence; isRelevantTable requires it to be >= 0.5
+}
+
+interface ColumnMapping { // Header-row column index for each logical field; -1 means "not found".
+ no: number; // column holding the row number
+ identification: number; // column holding the identification number
+ tagNo: number; // column holding the tag number
+ jointNo: number; // column holding the joint number
+ jointType: number; // column holding the joint type
+ weldingDate: number; // column holding the welding date
+}
+
+/**
+ * Main entry point: extracts data rows from every relevant table in an OCR result.
+ *
+ * Walks ocrResult.images in order and, for each image, its tables in order. A table
+ * contributes one ExtractedRow[] to the result only when isRelevantTable accepts it
+ * and extractTableData returns at least one row. An error thrown while processing a
+ * single table is logged and that table is skipped; it does not abort the run.
+ *
+ * NOTE(review): declared async but contains no await, so the promise simply resolves
+ * with the synchronously computed result.
+ *
+ * @param ocrResult OCR response; only the tables of each entry in `images` are read here.
+ * @returns One array of rows per successfully extracted table; [] when ocrResult or
+ *          ocrResult.images is missing.
+ */
+export async function extractTablesFromOCR(ocrResult: any): Promise<ExtractedRow[][]> {
+ const extractedTables: ExtractedRow[][] = [];
+ const warnings: string[] = []; // NOTE(review): never read or written after this line
+
+ if (!ocrResult || !ocrResult.images) {
+ console.warn('No OCR images found in result');
+ return [];
+ }
+
+ for (let imageIndex = 0; imageIndex < ocrResult.images.length; imageIndex++) {
+ const image = ocrResult.images[imageIndex];
+
+ if (!image.tables || image.tables.length === 0) {
+ console.warn(`No tables found in image ${imageIndex}`);
+ continue;
+ }
+
+ for (let tableIndex = 0; tableIndex < image.tables.length; tableIndex++) {
+ const table = image.tables[tableIndex];
+
+ try { // isolate failures so one malformed table cannot abort the remaining ones
+ if (isRelevantTable(table)) {
+ const extractedRows = extractTableData(table, imageIndex, tableIndex);
+
+ if (extractedRows.length > 0) {
+ extractedTables.push(extractedRows);
+ console.log(`Successfully extracted ${extractedRows.length} rows from table ${tableIndex + 1} in image ${imageIndex + 1}`);
+ } else {
+ console.warn(`Table ${tableIndex + 1} in image ${imageIndex + 1} was identified as relevant but no data could be extracted`);
+ }
+ } else {
+ console.log(`Table ${tableIndex + 1} in image ${imageIndex + 1} is not relevant (no required headers found)`);
+ }
+ } catch (error) {
+ console.error(`Error processing table ${tableIndex + 1} in image ${imageIndex + 1}:`, error);
+ }
+ }
+ }
+
+ console.log(`Total extracted tables: ${extractedTables.length}`);
+ return extractedTables;
+}
+
+/**
+ * Decides whether a table looks like one of the tables this module extracts.
+ *
+ * A table is relevant when, among the cells whose rowIndex is 0, 1 or 2, at least one
+ * cell text matches a "No" keyword and at least one matches an identification keyword,
+ * and the table has at least 6 cells and a table-level inferConfidence of at least 0.5.
+ *
+ * NOTE(review): this check is looser than findHeaderRow (it also accepts any text
+ * containing 'id', and the two keywords may sit on different rows), so a table can
+ * pass here and still yield no header row later.
+ *
+ * @param table OCR table to inspect.
+ * @returns true when all four conditions hold; false for a table without cells.
+ */
+function isRelevantTable(table: OCRTable): boolean {
+ if (!table.cells || table.cells.length === 0) {
+ return false;
+ }
+
+ // Look for header keywords in the first 3 rows only
+ const headerCells = table.cells.filter(cell => cell.rowIndex <= 2);
+ const headerTexts = headerCells
+ .map(cell => getCellText(cell).toLowerCase())
+ .filter(text => text.length > 0);
+
+ console.log('Header texts found:', headerTexts);
+
+ // Check for the required keywords ('번호' is Korean for "number")
+ const hasNo = headerTexts.some(text =>
+ text.includes('no.') ||
+ text === 'no' ||
+ text.includes('번호') ||
+ text.match(/^no\.?$/i) // redundant: matches only "no" / "no.", both already covered above
+ );
+
+ const hasIdentification = headerTexts.some(text =>
+ text.includes('identification') ||
+ text.includes('식별') ||
+ text.includes('ident') ||
+ text.includes('id') // NOTE(review): subsumes the two English checks above and matches any text containing "id" (e.g. "width")
+ );
+
+ // Table quality checks
+ const hasMinimumCells = table.cells.length >= 6; // minimum for a header plus some data
+ const hasReasonableConfidence = table.inferConfidence >= 0.5; // relaxed confidence threshold
+
+ const isRelevant = hasNo && hasIdentification && hasMinimumCells && hasReasonableConfidence;
+
+ console.log(`Table relevance check: hasNo=${hasNo}, hasIdentification=${hasIdentification}, minCells=${hasMinimumCells}, confidence=${hasReasonableConfidence} => ${isRelevant}`);
+
+ return isRelevant;
+}
+
+/**
+ * Extracts all valid data rows from a single OCR table.
+ *
+ * Steps: build a text grid from the cells, locate the header row, detect the table
+ * format from that header, map header columns to fields, then convert every
+ * non-empty row below the header with extractRowData and keep those that pass
+ * isValidRow. An error thrown for a single row is logged and that row is skipped.
+ *
+ * @param table OCR table to process.
+ * @param imageIndex Index of the image containing the table (used for logging and passed on).
+ * @param tableIndex Index of the table within the image (used for logging and passed on).
+ * @returns The valid rows; [] when the grid has fewer than 2 rows or no header row is found.
+ */
+function extractTableData(table: OCRTable, imageIndex: number, tableIndex: number): ExtractedRow[] {
+ console.log(`Processing table ${tableIndex + 1} in image ${imageIndex + 1}`);
+
+ // Build the table grid
+ const tableGrid = buildTableGrid(table);
+
+ if (tableGrid.length < 2) {
+ console.warn('Table has less than 2 rows (need header + data)');
+ return [];
+ }
+
+ console.log(`Table grid built: ${tableGrid.length} rows, ${tableGrid[0]?.length || 0} columns`);
+
+ // Find the header row
+ const headerRowIndex = findHeaderRow(tableGrid);
+ if (headerRowIndex === -1) {
+ console.warn('No header row found');
+ return [];
+ }
+
+ console.log(`Header row found at index: ${headerRowIndex}`);
+
+ // Determine the table format
+ const headerRow = tableGrid[headerRowIndex];
+ const tableFormat = determineTableFormat(headerRow);
+ console.log(`Table format detected: ${tableFormat}`);
+
+ // Find the column mapping
+ const columnMapping = findColumnMapping(headerRow, tableFormat);
+ console.log('Column mapping:', columnMapping);
+
+ // Extract the data rows (everything below the header row)
+ const dataRows: ExtractedRow[] = [];
+
+ for (let i = headerRowIndex + 1; i < tableGrid.length; i++) {
+ const row = tableGrid[i];
+
+ if (row && row.length > 0 && !isEmptyRow(row)) {
+ try {
+ const extractedRow = extractRowData(row, tableFormat, columnMapping, imageIndex, tableIndex, i);
+ if (extractedRow && isValidRow(extractedRow)) {
+ dataRows.push(extractedRow);
+ }
+ } catch (error) {
+ console.warn(`Error processing row ${i}:`, error);
+ }
+ }
+ }
+
+ console.log(`Extracted ${dataRows.length} valid rows from table`);
+ return dataRows;
+}
+
+/**
+ * Builds a rectangular grid of cell texts from the flat cell list of a table.
+ *
+ * The grid is sized to the largest row/column position covered by any cell, with
+ * every position initialised to ''. Each cell's text is written into every position
+ * its span covers, so the text of a spanning cell appears once per covered position.
+ * When a position already holds text, the new text is appended after a space.
+ *
+ * NOTE(review): assumes rowSpan and columnSpan are always present and >= 1; a
+ * missing span would make the size computation NaN - TODO confirm against the OCR API.
+ *
+ * @param table OCR table whose cells are laid out.
+ * @returns grid[row][column] of texts; [] when the table has no cells.
+ */
+function buildTableGrid(table: OCRTable): string[][] {
+ if (!table.cells || table.cells.length === 0) {
+ return [];
+ }
+
+ const maxRow = Math.max(...table.cells.map(cell => cell.rowIndex + cell.rowSpan - 1)) + 1; // row count = last covered row index + 1
+ const maxCol = Math.max(...table.cells.map(cell => cell.columnIndex + cell.columnSpan - 1)) + 1; // column count = last covered column index + 1
+
+ const grid: string[][] = Array(maxRow).fill(null).map(() => Array(maxCol).fill('')); // map() gives each row its own array
+
+ // Fill the grid with the cell contents
+ table.cells.forEach(cell => {
+ const text = getCellText(cell);
+
+ for (let r = cell.rowIndex; r < cell.rowIndex + cell.rowSpan; r++) {
+ for (let c = cell.columnIndex; c < cell.columnIndex + cell.columnSpan; c++) {
+ if (grid[r] && grid[r][c] !== undefined) {
+ // If the position already has text, join the two
+ grid[r][c] = grid[r][c] ? `${grid[r][c]} ${text}`.trim() : text;
+ }
+ }
+ }
+ });
+
+ return grid;
+}
+
+/**
+ * Returns the text of a cell: the words of each line joined with a single space,
+ * the lines joined with '\n', and the result trimmed.
+ *
+ * @param cell OCR cell to read.
+ * @returns The combined text; '' when the cell has no text lines.
+ */
+function getCellText(cell: TableCell): string {
+ if (!cell.cellTextLines || cell.cellTextLines.length === 0) {
+ return '';
+ }
+
+ return cell.cellTextLines
+ .map(line =>
+ line.cellWords // assumes every line has a cellWords array - TODO confirm
+ .map(word => word.inferText || '') // a falsy inferText contributes an empty word
+ .join(' ')
+ )
+ .join('\n')
+ .trim();
+}
+
+/**
+ * Finds the header row among the first three rows of the grid.
+ *
+ * A row qualifies when its lower-cased, space-joined text contains both a "No"
+ * marker ('no.', '번호', or the standalone word 'no') and an identification marker
+ * ('identification', '식별', or 'ident').
+ *
+ * @param tableGrid Grid produced by buildTableGrid.
+ * @returns Index of the first qualifying row, or -1 when none of the first three rows qualifies.
+ */
+function findHeaderRow(tableGrid: string[][]): number {
+ for (let i = 0; i < Math.min(3, tableGrid.length); i++) {
+ const row = tableGrid[i];
+ const rowText = row.join(' ').toLowerCase();
+
+ console.log(`Checking row ${i}: "${rowText}"`);
+
+ const hasNo = rowText.includes('no.') || rowText.includes('번호') || /\bno\b/.test(rowText);
+ const hasIdent = rowText.includes('identification') || rowText.includes('식별') || rowText.includes('ident');
+
+ if (hasNo && hasIdent) {
+ console.log(`Header row found at ${i}`);
+ return i;
+ }
+ }
+ return -1;
+}
+
+/**
+ * Classifies the table layout from its header row.
+ *
+ * Returns 'format2' when the lower-cased, space-joined header text contains 'tag',
+ * 'joint' and 'no'; otherwise 'format1'.
+ *
+ * NOTE(review): the checks run on the whole joined header, so 'no' may come from any
+ * column (for example a plain "No." column), not necessarily from the tag/joint columns.
+ *
+ * @param headerRow Texts of the header row.
+ * @returns 'format2' for separate Tag No / Joint No columns, 'format1' otherwise.
+ */
+function determineTableFormat(headerRow: string[]): 'format1' | 'format2' {
+ const headerText = headerRow.join(' ').toLowerCase();
+
+ // Format 2: Tag No and Joint No are separate columns
+ const hasTagNoColumn = headerText.includes('tag') && headerText.includes('no');
+ const hasJointNoColumn = headerText.includes('joint') && headerText.includes('no');
+
+ if (hasTagNoColumn && hasJointNoColumn) {
+ return 'format2';
+ }
+
+ // Format 1: everything is merged into the Identification No column
+ return 'format1';
+}
+
+/**
+ * Maps header columns to logical fields by keyword matching.
+ *
+ * Each header is lower-cased and trimmed, then tested against the branches below in
+ * order; the first matching branch wins for that column. When several columns match
+ * the same field, the last one overwrites the earlier ones.
+ *
+ * NOTE(review): the `format` parameter is not used in the body.
+ *
+ * @param headerRow Texts of the header row.
+ * @param format Detected table format (currently ignored).
+ * @returns Column index per field, with -1 for every field that was not found.
+ */
+function findColumnMapping(headerRow: string[], format: 'format1' | 'format2'): ColumnMapping {
+ const mapping: ColumnMapping = {
+ no: -1,
+ identification: -1,
+ tagNo: -1,
+ jointNo: -1,
+ jointType: -1,
+ weldingDate: -1
+ };
+
+ headerRow.forEach((header, index) => {
+ const lowerHeader = header.toLowerCase().trim();
+
+ console.log(`Column ${index}: "${header}" -> "${lowerHeader}"`);
+
+ if ((lowerHeader.includes('no.') || lowerHeader === 'no') && // NOTE(review): a header such as "ident no." lands here, since only the full word 'identification' is excluded
+ !lowerHeader.includes('identification') &&
+ !lowerHeader.includes('tag') &&
+ !lowerHeader.includes('joint')) {
+ mapping.no = index;
+ console.log(` -> Mapped to 'no'`);
+ } else if (lowerHeader.includes('identification') || lowerHeader.includes('ident')) {
+ mapping.identification = index;
+ console.log(` -> Mapped to 'identification'`);
+ } else if (lowerHeader.includes('tag') && lowerHeader.includes('no')) {
+ mapping.tagNo = index;
+ console.log(` -> Mapped to 'tagNo'`);
+ } else if (lowerHeader.includes('joint') && lowerHeader.includes('no')) {
+ mapping.jointNo = index;
+ console.log(` -> Mapped to 'jointNo'`);
+ } else if (lowerHeader.includes('joint') && lowerHeader.includes('type')) {
+ mapping.jointType = index;
+ console.log(` -> Mapped to 'jointType'`);
+ } else if (lowerHeader.includes('type') && !lowerHeader.includes('joint')) { // a bare "type" header is also treated as the joint type
+ mapping.jointType = index;
+ console.log(` -> Mapped to 'jointType'`);
+ } else if (lowerHeader.includes('welding') || lowerHeader.includes('date')) {
+ mapping.weldingDate = index;
+ console.log(` -> Mapped to 'weldingDate'`);
+ }
+ });
+
+ console.log('Final column mapping:', mapping);
+ return mapping;
+}
+
+/**
+ * Converts one grid row into an ExtractedRow using the column mapping.
+ *
+ * For 'format1' the identification column is split into identification / tag / joint
+ * numbers by parseIdentificationData; for 'format2' each of the three is read from
+ * its own column. The welding date comes from the mapped column when there is one,
+ * otherwise from the first cell that findDateColumn recognises as a date. Fields whose
+ * column is unmapped or outside the row stay ''.
+ *
+ * NOTE(review): `imageIndex` is not used in the body, and although the return type
+ * allows null, every path returns a row object.
+ *
+ * @param row Cell texts of the grid row.
+ * @param format Table format detected from the header.
+ * @param columnMapping Column index per field (-1 when not found).
+ * @param imageIndex Index of the source image (currently ignored).
+ * @param tableIndex Index of the source table; stored as sourceTable.
+ * @param rowIndex Index of the row in the grid; stored as sourceRow.
+ * @returns The populated row, including its computed confidence.
+ */
+function extractRowData(
+ row: string[],
+ format: 'format1' | 'format2',
+ columnMapping: ColumnMapping,
+ imageIndex: number,
+ tableIndex: number,
+ rowIndex: number
+): ExtractedRow | null {
+
+ const extractedRow: ExtractedRow = {
+ no: '',
+ identificationNo: '',
+ tagNo: '',
+ jointNo: '',
+ jointType: '',
+ weldingDate: '',
+ confidence: 0,
+ sourceTable: tableIndex,
+ sourceRow: rowIndex
+ };
+
+ console.log(`Processing row ${rowIndex}: [${row.map(cell => `"${cell}"`).join(', ')}]`);
+
+ // Extract No.
+ if (columnMapping.no >= 0 && columnMapping.no < row.length) {
+ extractedRow.no = cleanText(row[columnMapping.no]);
+ }
+
+ if (format === 'format1') {
+ // Format 1: identification, tag and joint numbers share one cell
+ if (columnMapping.identification >= 0 && columnMapping.identification < row.length) {
+ const combinedText = row[columnMapping.identification];
+ const parsedData = parseIdentificationData(combinedText);
+ extractedRow.identificationNo = parsedData.identificationNo;
+ extractedRow.tagNo = parsedData.tagNo;
+ extractedRow.jointNo = parsedData.jointNo;
+
+ console.log(` Parsed identification: "${combinedText}" -> `, parsedData);
+ }
+ } else {
+ // Format 2: separate columns
+ if (columnMapping.identification >= 0 && columnMapping.identification < row.length) {
+ extractedRow.identificationNo = cleanText(row[columnMapping.identification]);
+ }
+ if (columnMapping.tagNo >= 0 && columnMapping.tagNo < row.length) {
+ extractedRow.tagNo = cleanText(row[columnMapping.tagNo]);
+ }
+ if (columnMapping.jointNo >= 0 && columnMapping.jointNo < row.length) {
+ extractedRow.jointNo = cleanText(row[columnMapping.jointNo]);
+ }
+ }
+
+ // Extract Joint Type
+ if (columnMapping.jointType >= 0 && columnMapping.jointType < row.length) {
+ extractedRow.jointType = cleanText(row[columnMapping.jointType]);
+ }
+
+ // Extract Welding Date (use the mapped column if there is one, otherwise search by date pattern)
+ if (columnMapping.weldingDate >= 0 && columnMapping.weldingDate < row.length) {
+ extractedRow.weldingDate = cleanText(row[columnMapping.weldingDate]);
+ } else {
+ const dateIndex = findDateColumn(row);
+ if (dateIndex >= 0) {
+ extractedRow.weldingDate = cleanText(row[dateIndex]);
+ }
+ }
+
+ // Compute the confidence score
+ extractedRow.confidence = calculateRowConfidence(extractedRow);
+
+ console.log(` Extracted row:`, extractedRow);
+
+ return extractedRow;
+}
+
+/**
+ * Splits a combined identification cell (Format 1) into identification, tag and
+ * joint numbers using length heuristics.
+ *
+ * The cleaned text is split on whitespace, '-' and '_'. With no parts the cleaned
+ * text is returned as identificationNo; with one part that part is. Otherwise the
+ * longest part is the identification number, the first part of at most 3
+ * alphanumeric characters (or, failing that, the shortest part) is the joint
+ * number, and the first part equal to neither is the tag number ('' if none).
+ *
+ * NOTE(review): splitting on '-' and '_' also breaks up identifiers that contain
+ * those characters; and when the longest part itself has at most 3 alphanumeric
+ * characters, jointNo can end up equal to identificationNo.
+ *
+ * @param combinedText Raw text of the identification cell.
+ * @returns The three derived values.
+ */
+function parseIdentificationData(combinedText: string): {
+ identificationNo: string;
+ tagNo: string;
+ jointNo: string;
+} {
+ const cleanedText = cleanText(combinedText);
+
+ console.log(`Parsing identification data: "${cleanedText}"`);
+
+ // Split on line breaks first. NOTE(review): cleanText has already turned line
+ // breaks into spaces, so this yields at most one line.
+ const lines = cleanedText.split(/[\r\n]+/).map(line => line.trim()).filter(line => line.length > 0);
+
+ const allParts: string[] = [];
+ lines.forEach(line => {
+ // Split on whitespace and the separator characters '-' and '_'
+ const parts = line.split(/[\s\-_]+/).filter(part => part.length > 0);
+ allParts.push(...parts);
+ });
+
+ console.log(` Split into parts:`, allParts);
+
+ if (allParts.length === 0) {
+ return { identificationNo: cleanedText, tagNo: '', jointNo: '' };
+ }
+
+ if (allParts.length === 1) {
+ return { identificationNo: allParts[0], tagNo: '', jointNo: '' };
+ }
+
+ // Sort by length (longest first) to tell the parts apart
+ const sortedParts = [...allParts].sort((a, b) => b.length - a.length);
+
+ const identificationNo = sortedParts[0]; // the longest part
+ const jointNo = allParts.find(part => part.length <= 3 && /^[A-Z0-9]+$/i.test(part)) ||
+ sortedParts[sortedParts.length - 1]; // alphanumeric part of at most 3 characters, or else the shortest part
+ const tagNo = allParts.find(part => part !== identificationNo && part !== jointNo) || '';
+
+ const result = { identificationNo, tagNo, jointNo };
+ console.log(` Parsed result:`, result);
+
+ return result;
+}
+
+/**
+ * Finds the first cell in a row that contains a year-first date such as
+ * 2024-01-31, 2024.1.31 or 2024/01/31.
+ *
+ * @param row Cell texts of the row.
+ * @returns Index of the first matching cell, or -1 when no cell matches.
+ */
+function findDateColumn(row: string[]): number {
+ const datePattern = /\d{4}[.\-\/]\d{1,2}[.\-\/]\d{1,2}/; // unanchored: the date may appear anywhere in the cell
+
+ for (let i = 0; i < row.length; i++) {
+ if (datePattern.test(row[i])) {
+ console.log(` Found date in column ${i}: "${row[i]}"`);
+ return i;
+ }
+ }
+
+ return -1;
+}
+
+/**
+ * Normalises whitespace: every run of whitespace (including line breaks and tabs)
+ * becomes a single space, and leading/trailing whitespace is removed.
+ *
+ * @param text Text to normalise.
+ * @returns The normalised text.
+ */
+function cleanText(text: string): string {
+ return text
+ .replace(/[\r\n\t]+/g, ' ') // already covered by the \s+ pass below; kept as-is
+ .replace(/\s+/g, ' ')
+ .trim();
+}
+
+/**
+ * Tells whether a row has no content: true when every cell is falsy or contains
+ * only whitespace (and therefore also for a row with no cells).
+ */
+function isEmptyRow(row: string[]): boolean {
+ return row.every(cell => !cell || cell.trim().length === 0);
+}
+
+/**
+ * Decides whether an extracted row is worth keeping.
+ *
+ * In effect a row is valid when `no` is non-empty, or when `identificationNo` has at
+ * least 3 characters.
+ */
+function isValidRow(row: ExtractedRow): boolean {
+ // Valid only if at least one of the row number or the identification number is present
+ const hasBasicData = !!(row.no || row.identificationNo);
+
+ // Exclude data that is too short (guards against OCR misreads)
+ const hasReasonableLength = (row.identificationNo?.length || 0) >= 3 ||
+ (row.no?.length || 0) >= 1;
+
+ return hasBasicData && hasReasonableLength;
+}
+
+/**
+ * Computes a heuristic confidence score for an extracted row from how many fields
+ * are filled in and how plausible they look.
+ *
+ * Every non-empty field earns half of its weight, plus one bonus: 0.3 of the weight
+ * for an identification number longer than 10 characters, an all-digit `no`, or a
+ * welding date matching the year-first date pattern; otherwise 0.2 of the weight when
+ * the value is longer than 2 characters. The total is divided by the sum of weights.
+ *
+ * NOTE(review): with these weights and bonuses the highest reachable score is 0.75,
+ * so the Math.min(..., 1) cap never takes effect.
+ *
+ * @param row Row whose string fields are scored.
+ * @returns Score between 0 and 0.75.
+ */
+function calculateRowConfidence(row: ExtractedRow): number {
+ let score = 0;
+ let maxScore = 0;
+
+ // Weight of each field
+ const weights = {
+ no: 1,
+ identificationNo: 3, // the most important field
+ tagNo: 2,
+ jointNo: 2,
+ jointType: 1,
+ weldingDate: 1
+ };
+
+ Object.entries(weights).forEach(([field, weight]) => {
+ maxScore += weight;
+ const value = row[field as keyof ExtractedRow] as string; // only string fields are listed in weights
+
+ if (value && value.length > 0) {
+ // Base score
+ score += weight * 0.5;
+
+ // Bonus for length / plausible format
+ if (field === 'identificationNo' && value.length > 10) {
+ score += weight * 0.3;
+ } else if (field === 'no' && /^\d+$/.test(value)) {
+ score += weight * 0.3;
+ } else if (field === 'weldingDate' && /\d{4}[.\-\/]\d{1,2}[.\-\/]\d{1,2}/.test(value)) {
+ score += weight * 0.3;
+ } else if (value.length > 2) { // also reached by the three fields above when their specific test fails
+ score += weight * 0.2;
+ }
+ }
+ });
+
+ return maxScore > 0 ? Math.min(score / maxScore, 1) : 0;
+}
+
+/**
+ * Utility: summarises the quality of an OCR result.
+ *
+ * Counts the tables of all images, averages inferConfidence over all fields, and
+ * counts how often a field's lower-cased text contains one of the domain keywords.
+ *
+ * NOTE(review): unlike extractTablesFromOCR, `ocrResult` itself is not checked, so a
+ * null or undefined argument throws. Also, one field can match several keywords, so
+ * textQuality (keyword hits divided by field count) can exceed 1.
+ *
+ * @param ocrResult OCR response; the `tables` and `fields` of each entry in `images` are read.
+ * @returns confidence: mean field confidence (0 without fields); tablesFound: total
+ *          table count; textQuality: keyword hits per field (0 without fields);
+ *          keywordCount: total keyword hits.
+ */
+export function analyzeOCRQuality(ocrResult: any): {
+ confidence: number;
+ tablesFound: number;
+ textQuality: number;
+ keywordCount: number;
+} {
+ let totalConfidence = 0;
+ let totalFields = 0;
+ let tablesFound = 0;
+ let relevantKeywords = 0;
+
+ const keywords = ['no.', 'identification', 'joint', 'tag', 'type', 'weld', 'date'];
+
+ if (ocrResult.images) {
+ ocrResult.images.forEach((image: any) => {
+ // Table analysis
+ if (image.tables) {
+ tablesFound += image.tables.length;
+ }
+
+ // Field confidence analysis
+ if (image.fields) {
+ image.fields.forEach((field: any) => {
+ const confidence = field.inferConfidence || 0; // a missing confidence counts as 0 in the average
+ const text = (field.inferText || '').toLowerCase();
+
+ totalConfidence += confidence;
+ totalFields++;
+
+ // Check for relevant keywords
+ keywords.forEach(keyword => {
+ if (text.includes(keyword)) {
+ relevantKeywords++;
+ }
+ });
+ });
+ }
+ });
+ }
+
+ const avgConfidence = totalFields > 0 ? totalConfidence / totalFields : 0;
+ const textQuality = totalFields > 0 ? relevantKeywords / totalFields : 0;
+
+ return {
+ confidence: avgConfidence,
+ tablesFound,
+ textQuality,
+ keywordCount: relevantKeywords
+ };
+} \ No newline at end of file