diff options
| author | dujinkim <dujin.kim@dtsolution.co.kr> | 2025-06-11 12:18:38 +0000 |
|---|---|---|
| committer | dujinkim <dujin.kim@dtsolution.co.kr> | 2025-06-11 12:18:38 +0000 |
| commit | ff902243a658067fae858a615c0629aa2e0a4837 (patch) | |
| tree | 42d30e986d1cbfb282c644c01730cd053b816b7a /app/api/ocr/utils/tableExtraction.ts | |
| parent | 42e38f41cb4c0b4bf9c08b71ed087cd7f0c7fc18 (diff) | |
(대표님) 20250611 21시 15분 OCR 등
Diffstat (limited to 'app/api/ocr/utils/tableExtraction.ts')
| -rw-r--r-- | app/api/ocr/utils/tableExtraction.ts | 611 |
1 files changed, 161 insertions, 450 deletions
diff --git a/app/api/ocr/utils/tableExtraction.ts b/app/api/ocr/utils/tableExtraction.ts index ea543f8e..720e5a5f 100644 --- a/app/api/ocr/utils/tableExtraction.ts +++ b/app/api/ocr/utils/tableExtraction.ts @@ -1,7 +1,11 @@ // app/api/ocr/utils/tableExtraction.ts -// 완전한 테이블 추출 로직 구현 +// 개선된 완전 테이블 추출 로직 – Format‑1 식별번호 파싱 보강 & 중복 행 제거 -interface ExtractedRow { +/* -------------------------------------------------------------------------- */ +/* 타입 */ +/* -------------------------------------------------------------------------- */ + +export interface ExtractedRow { no: string; identificationNo: string; tagNo: string; @@ -41,516 +45,223 @@ interface ColumnMapping { weldingDate: number; } -// 메인 테이블 추출 함수 -export async function extractTablesFromOCR(ocrResult: any): Promise<ExtractedRow[][]> { - const extractedTables: ExtractedRow[][] = []; - const warnings: string[] = []; +/* -------------------------------------------------------------------------- */ +/* 메인 */ +/* -------------------------------------------------------------------------- */ - if (!ocrResult || !ocrResult.images) { - console.warn('No OCR images found in result'); - return []; - } +export async function extractTablesFromOCR (ocrResult: any): Promise<ExtractedRow[][]> { + const tables: ExtractedRow[][] = []; + if (!ocrResult?.images) return tables; - for (let imageIndex = 0; imageIndex < ocrResult.images.length; imageIndex++) { - const image = ocrResult.images[imageIndex]; - - if (!image.tables || image.tables.length === 0) { - console.warn(`No tables found in image ${imageIndex}`); - continue; - } + ocrResult.images.forEach((image: any, imgIdx: number) => { + image.tables?.forEach((table: OCRTable, tblIdx: number) => { + if (!isRelevantTable(table)) return; + const rows = extractTableData(table, imgIdx, tblIdx); + if (rows.length) tables.push(rows); + }); + }); + return tables; +} - for (let tableIndex = 0; tableIndex < image.tables.length; tableIndex++) { - const table = image.tables[tableIndex]; - - try { - if (isRelevantTable(table)) { - const extractedRows = extractTableData(table, imageIndex, tableIndex); - - if (extractedRows.length > 0) { - extractedTables.push(extractedRows); - console.log(`Successfully extracted ${extractedRows.length} rows from table ${tableIndex + 1} in image ${imageIndex + 1}`); - } else { - console.warn(`Table ${tableIndex + 1} in image ${imageIndex + 1} was identified as relevant but no data could be extracted`); - } - } else { - console.log(`Table ${tableIndex + 1} in image ${imageIndex + 1} is not relevant (no required headers found)`); - } - } catch (error) { - console.error(`Error processing table ${tableIndex + 1} in image ${imageIndex + 1}:`, error); - } - } - } +/* -------------------------------------------------------------------------- */ +/* 관련 테이블 판별 */ +/* -------------------------------------------------------------------------- */ - console.log(`Total extracted tables: ${extractedTables.length}`); - return extractedTables; +function isRelevantTable (table: OCRTable): boolean { + const headers = table.cells.filter(c => c.rowIndex < 3).map(getCellText).join(' ').toLowerCase(); + return /\bno\b|번호/.test(headers) && /identification|식별|ident|id/.test(headers); } -// 관련 테이블인지 확인 -function isRelevantTable(table: OCRTable): boolean { - if (!table.cells || table.cells.length === 0) { - return false; - } +/* -------------------------------------------------------------------------- */ +/* 표 해석 */ +/* -------------------------------------------------------------------------- */ - // 첫 3행에서 헤더 찾기 - const headerCells = table.cells.filter(cell => cell.rowIndex <= 2); - const headerTexts = headerCells - .map(cell => getCellText(cell).toLowerCase()) - .filter(text => text.length > 0); - - console.log('Header texts found:', headerTexts); - - // 필수 키워드 확인 - const hasNo = headerTexts.some(text => - text.includes('no.') || - text === 'no' || - text.includes('번호') || - text.match(/^no\.?$/i) - ); - - const hasIdentification = headerTexts.some(text => - text.includes('identification') || - text.includes('식별') || - text.includes('ident') || - text.includes('id') - ); - - // 테이블 품질 확인 - const hasMinimumCells = table.cells.length >= 6; // 최소 헤더 + 데이터 - const hasReasonableConfidence = table.inferConfidence >= 0.5; // 신뢰도 기준 완화 - - const isRelevant = hasNo && hasIdentification && hasMinimumCells && hasReasonableConfidence; - - console.log(`Table relevance check: hasNo=${hasNo}, hasIdentification=${hasIdentification}, minCells=${hasMinimumCells}, confidence=${hasReasonableConfidence} => ${isRelevant}`); - - return isRelevant; -} +function extractTableData (table: OCRTable, imgIdx: number, tblIdx: number): ExtractedRow[] { + const grid = buildGrid(table); + const headerRowIdx = findHeaderRow(grid); + if (headerRowIdx === -1) return []; -// 테이블 데이터 추출 -function extractTableData(table: OCRTable, imageIndex: number, tableIndex: number): ExtractedRow[] { - console.log(`Processing table ${tableIndex + 1} in image ${imageIndex + 1}`); - - // 테이블 그리드 구축 - const tableGrid = buildTableGrid(table); - - if (tableGrid.length < 2) { - console.warn('Table has less than 2 rows (need header + data)'); - return []; - } + const format = detectFormat(grid[headerRowIdx]); + const mapping = mapColumns(grid[headerRowIdx]); - console.log(`Table grid built: ${tableGrid.length} rows, ${tableGrid[0]?.length || 0} columns`); + const seen = new Set<string>(); + const data: ExtractedRow[] = []; - // 헤더 행 찾기 - const headerRowIndex = findHeaderRow(tableGrid); - if (headerRowIndex === -1) { - console.warn('No header row found'); - return []; - } + for (let r = headerRowIdx + 1; r < grid.length; r++) { + const row = grid[r]; + if (isBlankRow(row)) continue; - console.log(`Header row found at index: ${headerRowIndex}`); - - // 테이블 형식 결정 - const headerRow = tableGrid[headerRowIndex]; - const tableFormat = determineTableFormat(headerRow); - console.log(`Table format detected: ${tableFormat}`); - - // 컬럼 매핑 찾기 - const columnMapping = findColumnMapping(headerRow, tableFormat); - console.log('Column mapping:', columnMapping); - - // 데이터 행 추출 - const dataRows: ExtractedRow[] = []; - - for (let i = headerRowIndex + 1; i < tableGrid.length; i++) { - const row = tableGrid[i]; - - if (row && row.length > 0 && !isEmptyRow(row)) { - try { - const extractedRow = extractRowData(row, tableFormat, columnMapping, imageIndex, tableIndex, i); - if (extractedRow && isValidRow(extractedRow)) { - dataRows.push(extractedRow); - } - } catch (error) { - console.warn(`Error processing row ${i}:`, error); - } - } + const parsed = buildRow(row, format, mapping, tblIdx, r); + if (!parsed || !isValidRow(parsed)) continue; + + const key = `${parsed.no}-${parsed.identificationNo}`; + if (seen.has(key)) continue; + seen.add(key); + + data.push(parsed); } - - console.log(`Extracted ${dataRows.length} valid rows from table`); - return dataRows; + return data; } -// 테이블 그리드 구축 -function buildTableGrid(table: OCRTable): string[][] { - if (!table.cells || table.cells.length === 0) { - return []; - } +/* -------------------------------------------------------------------------- */ +/* Grid & Header */ +/* -------------------------------------------------------------------------- */ + +function buildGrid (table: OCRTable): string[][] { + const maxR = Math.max(...table.cells.map(c => c.rowIndex + c.rowSpan - 1)); + const maxC = Math.max(...table.cells.map(c => c.columnIndex + c.columnSpan - 1)); + const grid = Array.from({ length: maxR + 1 }, () => Array(maxC + 1).fill('')); - const maxRow = Math.max(...table.cells.map(cell => cell.rowIndex + cell.rowSpan - 1)) + 1; - const maxCol = Math.max(...table.cells.map(cell => cell.columnIndex + cell.columnSpan - 1)) + 1; - - const grid: string[][] = Array(maxRow).fill(null).map(() => Array(maxCol).fill('')); - - // 셀 내용으로 그리드 채우기 table.cells.forEach(cell => { - const text = getCellText(cell); - + const txt = getCellText(cell); for (let r = cell.rowIndex; r < cell.rowIndex + cell.rowSpan; r++) { for (let c = cell.columnIndex; c < cell.columnIndex + cell.columnSpan; c++) { - if (grid[r] && grid[r][c] !== undefined) { - // 기존 텍스트가 있으면 결합 - grid[r][c] = grid[r][c] ? `${grid[r][c]} ${text}`.trim() : text; - } + grid[r][c] = grid[r][c] ? `${grid[r][c]} ${txt}` : txt; } } }); - return grid; } -// 셀 텍스트 추출 -function getCellText(cell: TableCell): string { - if (!cell.cellTextLines || cell.cellTextLines.length === 0) { - return ''; - } - - return cell.cellTextLines - .map(line => - line.cellWords - .map(word => word.inferText || '') - .join(' ') - ) - .join('\n') - .trim(); +function getCellText (cell: TableCell): string { + return cell.cellTextLines?.flatMap(l => l.cellWords.map(w => w.inferText.trim())).filter(Boolean).join(' ') ?? ''; } -// 헤더 행 찾기 -function findHeaderRow(tableGrid: string[][]): number { - for (let i = 0; i < Math.min(3, tableGrid.length); i++) { - const row = tableGrid[i]; - const rowText = row.join(' ').toLowerCase(); - - console.log(`Checking row ${i}: "${rowText}"`); - - const hasNo = rowText.includes('no.') || rowText.includes('번호') || /\bno\b/.test(rowText); - const hasIdent = rowText.includes('identification') || rowText.includes('식별') || rowText.includes('ident'); - - if (hasNo && hasIdent) { - console.log(`Header row found at ${i}`); - return i; - } +function findHeaderRow (grid: string[][]): number { + for (let i = 0; i < Math.min(3, grid.length); i++) { + const t = grid[i].join(' ').toLowerCase(); + if (/\bno\b|번호/.test(t) && /identification|식별|ident/.test(t)) return i; } return -1; } -// 테이블 형식 결정 -function determineTableFormat(headerRow: string[]): 'format1' | 'format2' { - const headerText = headerRow.join(' ').toLowerCase(); - - // Format 2: Tag No와 Joint No가 분리된 컬럼 - const hasTagNoColumn = headerText.includes('tag') && headerText.includes('no'); - const hasJointNoColumn = headerText.includes('joint') && headerText.includes('no'); - - if (hasTagNoColumn && hasJointNoColumn) { - return 'format2'; - } - - // Format 1: Identification No에 통합 - return 'format1'; -} +/* -------------------------------------------------------------------------- */ +/* Column Mapping */ +/* -------------------------------------------------------------------------- */ -// 컬럼 매핑 찾기 -function findColumnMapping(headerRow: string[], format: 'format1' | 'format2'): ColumnMapping { - const mapping: ColumnMapping = { - no: -1, - identification: -1, - tagNo: -1, - jointNo: -1, - jointType: -1, - weldingDate: -1 - }; +function detectFormat (header: string[]): 'format1' | 'format2' { + const h = header.join(' ').toLowerCase(); + return h.includes('tag') && h.includes('joint') ? 'format2' : 'format1'; +} - headerRow.forEach((header, index) => { - const lowerHeader = header.toLowerCase().trim(); - - console.log(`Column ${index}: "${header}" -> "${lowerHeader}"`); - - if ((lowerHeader.includes('no.') || lowerHeader === 'no') && - !lowerHeader.includes('identification') && - !lowerHeader.includes('tag') && - !lowerHeader.includes('joint')) { - mapping.no = index; - console.log(` -> Mapped to 'no'`); - } else if (lowerHeader.includes('identification') || lowerHeader.includes('ident')) { - mapping.identification = index; - console.log(` -> Mapped to 'identification'`); - } else if (lowerHeader.includes('tag') && lowerHeader.includes('no')) { - mapping.tagNo = index; - console.log(` -> Mapped to 'tagNo'`); - } else if (lowerHeader.includes('joint') && lowerHeader.includes('no')) { - mapping.jointNo = index; - console.log(` -> Mapped to 'jointNo'`); - } else if (lowerHeader.includes('joint') && lowerHeader.includes('type')) { - mapping.jointType = index; - console.log(` -> Mapped to 'jointType'`); - } else if (lowerHeader.includes('type') && !lowerHeader.includes('joint')) { - mapping.jointType = index; - console.log(` -> Mapped to 'jointType'`); - } else if (lowerHeader.includes('welding') || lowerHeader.includes('date')) { - mapping.weldingDate = index; - console.log(` -> Mapped to 'weldingDate'`); - } +function mapColumns (header: string[]): ColumnMapping { + const mp: ColumnMapping = { no: -1, identification: -1, tagNo: -1, jointNo: -1, jointType: -1, weldingDate: -1 }; + + header.forEach((h, i) => { + const t = h.toLowerCase(); + if (/^no\.?$/.test(t) && !/ident|tag|joint/.test(t)) mp.no = i; + else if (/identification|ident/.test(t)) mp.identification = i; + else if (/tag.*no/.test(t)) mp.tagNo = i; + else if (/joint.*no/.test(t)) mp.jointNo = i; + else if (/joint.*type/.test(t) || (/^type$/.test(t) && mp.jointType === -1)) mp.jointType = i; + else if (/welding|date/.test(t)) mp.weldingDate = i; }); - - console.log('Final column mapping:', mapping); - return mapping; + return mp; } -// 행 데이터 추출 -function extractRowData( - row: string[], - format: 'format1' | 'format2', - columnMapping: ColumnMapping, - imageIndex: number, - tableIndex: number, - rowIndex: number +/* -------------------------------------------------------------------------- */ +/* Row Extraction */ +/* -------------------------------------------------------------------------- */ + +function buildRow ( + row: string[], + format: 'format1' | 'format2', + mp: ColumnMapping, + tblIdx: number, + rowIdx: number ): ExtractedRow | null { - - const extractedRow: ExtractedRow = { - no: '', + const out: ExtractedRow = { + no: mp.no >= 0 ? clean(row[mp.no]) : '', identificationNo: '', tagNo: '', jointNo: '', - jointType: '', + jointType: mp.jointType >= 0 ? clean(row[mp.jointType]) : '', weldingDate: '', confidence: 0, - sourceTable: tableIndex, - sourceRow: rowIndex + sourceTable: tblIdx, + sourceRow: rowIdx, }; - console.log(`Processing row ${rowIndex}: [${row.map(cell => `"${cell}"`).join(', ')}]`); - - // No. 추출 - if (columnMapping.no >= 0 && columnMapping.no < row.length) { - extractedRow.no = cleanText(row[columnMapping.no]); + if (mp.weldingDate >= 0) out.weldingDate = clean(row[mp.weldingDate]); + else { + const idx = row.findIndex(col => /\d{4}[.\-/]\d{1,2}[.\-/]\d{1,2}/.test(col)); + if (idx >= 0) out.weldingDate = clean(row[idx]); } - if (format === 'format1') { - // Format 1: 통합된 identification 데이터 - if (columnMapping.identification >= 0 && columnMapping.identification < row.length) { - const combinedText = row[columnMapping.identification]; - const parsedData = parseIdentificationData(combinedText); - extractedRow.identificationNo = parsedData.identificationNo; - extractedRow.tagNo = parsedData.tagNo; - extractedRow.jointNo = parsedData.jointNo; - - console.log(` Parsed identification: "${combinedText}" -> `, parsedData); - } + if (format === 'format2') { + if (mp.identification >= 0) out.identificationNo = clean(row[mp.identification]); + if (mp.jointNo >= 0) out.jointNo = clean(row[mp.jointNo]); + if (mp.tagNo >= 0) out.tagNo = clean(row[mp.tagNo]); } else { - // Format 2: 분리된 컬럼들 - if (columnMapping.identification >= 0 && columnMapping.identification < row.length) { - extractedRow.identificationNo = cleanText(row[columnMapping.identification]); - } - if (columnMapping.tagNo >= 0 && columnMapping.tagNo < row.length) { - extractedRow.tagNo = cleanText(row[columnMapping.tagNo]); - } - if (columnMapping.jointNo >= 0 && columnMapping.jointNo < row.length) { - extractedRow.jointNo = cleanText(row[columnMapping.jointNo]); - } + const combined = mp.identification >= 0 ? row[mp.identification] : ''; + const parsed = parseIdentificationData(combined); + out.identificationNo = parsed.identificationNo; + out.jointNo = parsed.jointNo; + out.tagNo = parsed.tagNo; } - // Joint Type 추출 - if (columnMapping.jointType >= 0 && columnMapping.jointType < row.length) { - extractedRow.jointType = cleanText(row[columnMapping.jointType]); - } - - // Welding Date 추출 (컬럼 매핑이 있으면 사용, 없으면 날짜 패턴으로 찾기) - if (columnMapping.weldingDate >= 0 && columnMapping.weldingDate < row.length) { - extractedRow.weldingDate = cleanText(row[columnMapping.weldingDate]); - } else { - const dateIndex = findDateColumn(row); - if (dateIndex >= 0) { - extractedRow.weldingDate = cleanText(row[dateIndex]); - } - } - - // 신뢰도 계산 - extractedRow.confidence = calculateRowConfidence(extractedRow); - - console.log(` Extracted row:`, extractedRow); - - return extractedRow; + out.confidence = scoreRow(out); + return out; } -// Identification 데이터 파싱 (Format 1용) -function parseIdentificationData(combinedText: string): { - identificationNo: string; - tagNo: string; - jointNo: string; -} { - const cleanedText = cleanText(combinedText); - - console.log(`Parsing identification data: "${cleanedText}"`); - - // 줄바꿈으로 먼저 분리 - const lines = cleanedText.split(/[\r\n]+/).map(line => line.trim()).filter(line => line.length > 0); - - const allParts: string[] = []; - lines.forEach(line => { - // 공백과 특수문자로 분리 - const parts = line.split(/[\s\-_]+/).filter(part => part.length > 0); - allParts.push(...parts); - }); +/* -------------------------------------------------------------------------- */ +/* Format‑1 셀 파싱 */ +/* -------------------------------------------------------------------------- */ - console.log(` Split into parts:`, allParts); +function parseIdentificationData (txt: string): { identificationNo: string; jointNo: string; tagNo: string } { + const cleaned = clean(txt); + if (!cleaned) return { identificationNo: '', jointNo: '', tagNo: '' }; - if (allParts.length === 0) { - return { identificationNo: cleanedText, tagNo: '', jointNo: '' }; - } + const tokens = cleaned.split(/\s+/).map(clean).filter(Boolean); - if (allParts.length === 1) { - return { identificationNo: allParts[0], tagNo: '', jointNo: '' }; - } + // Identification 후보: 하이픈이 2개 이상 포함된 토큰 가운데 가장 긴 것 + const idCand = tokens.filter(t => t.split('-').length >= 3).sort((a, b) => b.length - a.length); + const identificationNo = idCand[0] || ''; - // 길이별로 정렬하여 식별 - const sortedParts = [...allParts].sort((a, b) => b.length - a.length); - - const identificationNo = sortedParts[0]; // 가장 긴 것 - const jointNo = allParts.find(part => part.length <= 3 && /^[A-Z0-9]+$/i.test(part)) || - sortedParts[sortedParts.length - 1]; // 3글자 이하 영숫자 또는 가장 짧은 것 - const tagNo = allParts.find(part => part !== identificationNo && part !== jointNo) || ''; - - const result = { identificationNo, tagNo, jointNo }; - console.log(` Parsed result:`, result); - - return result; -} + const residual = tokens.filter(t => t !== identificationNo); + if (!residual.length) return { identificationNo, jointNo: '', tagNo: '' }; -// 날짜 컬럼 찾기 -function findDateColumn(row: string[]): number { - const datePattern = /\d{4}[.\-\/]\d{1,2}[.\-\/]\d{1,2}/; - - for (let i = 0; i < row.length; i++) { - if (datePattern.test(row[i])) { - console.log(` Found date in column ${i}: "${row[i]}"`); - return i; - } - } - - return -1; -} + residual.sort((a, b) => a.length - b.length); + const jointNo = residual[0] || ''; + const tagNo = residual[residual.length - 1] || ''; -// 텍스트 정리 -function cleanText(text: string): string { - return text - .replace(/[\r\n\t]+/g, ' ') - .replace(/\s+/g, ' ') - .trim(); + return { identificationNo, jointNo, tagNo }; } -// 빈 행 확인 -function isEmptyRow(row: string[]): boolean { - return row.every(cell => !cell || cell.trim().length === 0); +/* -------------------------------------------------------------------------- */ +/* Helpers */ +/* -------------------------------------------------------------------------- */ + +const clean = (s: string = '') => s.replace(/[\r\n\t]+/g, ' ').replace(/\s+/g, ' ').trim(); +const isBlankRow = (row: string[]) => row.every(c => !clean(c)); +const isValidRow = (r: ExtractedRow) => !!(r.no || r.identificationNo); + +function scoreRow (r: ExtractedRow): number { + const w: Record<keyof ExtractedRow, number> = { + no: 1, identificationNo: 3, tagNo: 2, jointNo: 2, jointType: 1, weldingDate: 1, + confidence: 0, sourceTable: 0, sourceRow: 0, + } as any; + let s = 0, t = 0; + (Object.keys(w) as (keyof ExtractedRow)[]).forEach(k => { t += w[k]; if ((r[k] as string)?.length) s += w[k]; }); + return t ? s / t : 0; } -// 유효한 행 확인 -function isValidRow(row: ExtractedRow): boolean { - // 번호나 식별번호 중 하나라도 있으면 유효 - const hasBasicData = !!(row.no || row.identificationNo); - - // 너무 짧은 데이터는 제외 (오인식 방지) - const hasReasonableLength = (row.identificationNo?.length || 0) >= 3 || - (row.no?.length || 0) >= 1; - - return hasBasicData && hasReasonableLength; -} +/* -------------------------------------------------------------------------- */ +/* OCR 품질 분석 (기존 로직 유지) */ +/* -------------------------------------------------------------------------- */ -// 행 신뢰도 계산 -function calculateRowConfidence(row: ExtractedRow): number { - let score = 0; - let maxScore = 0; - - // 각 필드별 가중치 - const weights = { - no: 1, - identificationNo: 3, // 가장 중요 - tagNo: 2, - jointNo: 2, - jointType: 1, - weldingDate: 1 - }; +export function analyzeOCRQuality (ocrResult: any) { + let conf = 0, cnt = 0, tbl = 0, kw = 0; + const keys = ['no.', 'identification', 'joint', 'tag', 'type', 'weld', 'date']; - Object.entries(weights).forEach(([field, weight]) => { - maxScore += weight; - const value = row[field as keyof ExtractedRow] as string; - - if (value && value.length > 0) { - // 기본 점수 - score += weight * 0.5; - - // 길이 보너스 - if (field === 'identificationNo' && value.length > 10) { - score += weight * 0.3; - } else if (field === 'no' && /^\d+$/.test(value)) { - score += weight * 0.3; - } else if (field === 'weldingDate' && /\d{4}[.\-\/]\d{1,2}[.\-\/]\d{1,2}/.test(value)) { - score += weight * 0.3; - } else if (value.length > 2) { - score += weight * 0.2; - } - } + ocrResult.images?.forEach((img: any) => { + tbl += img.tables?.length || 0; + img.fields?.forEach((f: any) => { + conf += f.inferConfidence || 0; cnt++; + const t = (f.inferText || '').toLowerCase(); + keys.forEach(k => { if (t.includes(k)) kw++; }); + }); }); - return maxScore > 0 ? Math.min(score / maxScore, 1) : 0; + return { confidence: cnt ? conf / cnt : 0, tablesFound: tbl, textQuality: cnt ? kw / cnt : 0, keywordCount: kw }; } - -// 유틸리티: OCR 결과 품질 분석 -export function analyzeOCRQuality(ocrResult: any): { - confidence: number; - tablesFound: number; - textQuality: number; - keywordCount: number; -} { - let totalConfidence = 0; - let totalFields = 0; - let tablesFound = 0; - let relevantKeywords = 0; - - const keywords = ['no.', 'identification', 'joint', 'tag', 'type', 'weld', 'date']; - - if (ocrResult.images) { - ocrResult.images.forEach((image: any) => { - // 테이블 분석 - if (image.tables) { - tablesFound += image.tables.length; - } - - // 필드 신뢰도 분석 - if (image.fields) { - image.fields.forEach((field: any) => { - const confidence = field.inferConfidence || 0; - const text = (field.inferText || '').toLowerCase(); - - totalConfidence += confidence; - totalFields++; - - // 관련 키워드 확인 - keywords.forEach(keyword => { - if (text.includes(keyword)) { - relevantKeywords++; - } - }); - }); - } - }); - } - - const avgConfidence = totalFields > 0 ? totalConfidence / totalFields : 0; - const textQuality = totalFields > 0 ? relevantKeywords / totalFields : 0; - - return { - confidence: avgConfidence, - tablesFound, - textQuality, - keywordCount: relevantKeywords - }; -}
\ No newline at end of file |
