diff options
Diffstat (limited to 'app/api/ocr/utils/tableExtraction.ts')
| -rw-r--r-- | app/api/ocr/utils/tableExtraction.ts | 651 |
1 files changed, 55 insertions, 596 deletions
diff --git a/app/api/ocr/utils/tableExtraction.ts b/app/api/ocr/utils/tableExtraction.ts index 0a727f84..3c44e7fb 100644 --- a/app/api/ocr/utils/tableExtraction.ts +++ b/app/api/ocr/utils/tableExtraction.ts @@ -69,107 +69,40 @@ export async function extractTablesFromOCR (ocrResult: any): Promise<ExtractedRo function isRelevantTable (table: OCRTable): boolean { const headers = table.cells.filter(c => c.rowIndex < 3).map(getCellText).join(' ').toLowerCase(); - console.log(`๐ Checking table relevance. Headers: "${headers}"`); - - // ๊ธฐ์กด ์กฐ๊ฑด - const hasNoColumn = /\bno\b|๋ฒํธ/.test(headers); - const hasIdentification = /identification|์๋ณ|ident|id/.test(headers); - - console.log(`๐ Has NO column: ${hasNoColumn}`); - console.log(`๐ Has Identification: ${hasIdentification}`); - - // ๊ธฐ๋ณธ ์กฐ๊ฑด - if (hasNoColumn && hasIdentification) { - console.log(`โ
Table passes strict criteria`); - return true; - } - - // ์ํ๋ ์กฐ๊ฑด๋ค - const relaxedConditions = [ - // ์กฐ๊ฑด 1: ํ
์ด๋ธ์ ์ฌ๋ฌ ์ด์ด ์๊ณ ์ซ์๋ ์๋ณ์ ํจํด์ด ๋ณด์ด๋ ๊ฒฝ์ฐ - table.cells.length > 10 && /\d+/.test(headers), - - // ์กฐ๊ฑด 2: joint, tag, weld ๋ฑ ๊ด๋ จ ํค์๋๊ฐ ์๋ ๊ฒฝ์ฐ - /joint|tag|weld|type|date/.test(headers), - - // ์กฐ๊ฑด 3: ์๋ณ๋ฒํธ ํจํด์ด ๋ณด์ด๋ ๊ฒฝ์ฐ (ํ์ดํ์ด ํฌํจ๋ ๋ฌธ์์ด) - headers.includes('-') && headers.length > 20, - - // ์กฐ๊ฑด 4: ํ๊ตญ์ด ๊ด๋ จ ํค์๋ - /์ฉ์ |์กฐ์ธํธ|ํ๊ทธ/.test(headers) - ]; - - const passedConditions = relaxedConditions.filter(Boolean).length; - console.log(`๐ Relaxed conditions passed: ${passedConditions}/${relaxedConditions.length}`); - - if (passedConditions >= 1) { - console.log(`โ
Table passes relaxed criteria`); - return true; - } - - console.log(`โ Table does not meet any criteria`); - return false; + return /\bno\b|๋ฒํธ/.test(headers) && /identification|์๋ณ|ident|id/.test(headers); } + /* -------------------------------------------------------------------------- */ /* ํ ํด์ */ /* -------------------------------------------------------------------------- */ function extractTableData (table: OCRTable, imgIdx: number, tblIdx: number): ExtractedRow[] { - console.log(`๐ง Starting extractTableData for table ${imgIdx}-${tblIdx}`); - const grid = buildGrid(table); - console.log(`๐ Grid size: ${grid.length} rows x ${grid[0]?.length || 0} columns`); - const headerRowIdx = findHeaderRow(grid); - console.log(`๐ Header row index: ${headerRowIdx}`); - if (headerRowIdx === -1) { - console.log(`โ No header row found`); - return []; - } + console.log(headerRowIdx,"headerRowIdx") + + if (headerRowIdx === -1) return []; - const format = detectFormat(grid[headerRowIdx]); - const mapping = mapColumns(grid[headerRowIdx]); - - console.log(`๐ Detected format: ${format}`); - console.log(`๐๏ธ Column mapping:`, mapping); + const format = detectFormat(grid[headerRowIdx]); + const mapping = mapColumns(grid[headerRowIdx]); const seen = new Set<string>(); const data: ExtractedRow[] = []; for (let r = headerRowIdx + 1; r < grid.length; r++) { const row = grid[r]; - - if (isBlankRow(row)) { - console.log(`โญ๏ธ Row ${r}: blank, skipping`); - continue; - } - - console.log(`๐ Processing row ${r}: [${row.join(' | ')}]`); + if (isBlankRow(row)) continue; const parsed = buildRow(row, format, mapping, tblIdx, r); - if (!parsed) { - console.log(`โ Row ${r}: failed to parse`); - continue; - } - - if (!isValidRow(parsed)) { - console.log(`โ Row ${r}: invalid (no: "${parsed.no}", id: "${parsed.identificationNo}")`); - continue; - } + if (!parsed || !isValidRow(parsed)) continue; const key = `${parsed.no}-${parsed.identificationNo}`; - if (seen.has(key)) { - console.log(`โ ๏ธ Row ${r}: duplicate key "${key}", skipping`); - continue; - } - + if (seen.has(key)) continue; seen.add(key); + data.push(parsed); - console.log(`โ
Row ${r}: added (${JSON.stringify(parsed)})`); } - - console.log(`๐ฏ Table ${imgIdx}-${tblIdx}: extracted ${data.length} valid rows`); return data; } @@ -178,39 +111,18 @@ function extractTableData (table: OCRTable, imgIdx: number, tblIdx: number): Ext /* -------------------------------------------------------------------------- */ function buildGrid (table: OCRTable): string[][] { - console.log(`๐ง Building grid from ${table.cells.length} cells`); - const maxR = Math.max(...table.cells.map(c => c.rowIndex + c.rowSpan - 1)); const maxC = Math.max(...table.cells.map(c => c.columnIndex + c.columnSpan - 1)); - - console.log(`๐ Grid dimensions: ${maxR + 1} rows x ${maxC + 1} columns`); - const grid = Array.from({ length: maxR + 1 }, () => Array(maxC + 1).fill('')); - // ์
๋ณ ์์ธ ์ ๋ณด ์ถ๋ ฅ - table.cells.forEach((cell, idx) => { + table.cells.forEach(cell => { const txt = getCellText(cell); - console.log(`๐ฑ Cell ${idx}: (${cell.rowIndex},${cell.columnIndex}) span(${cell.rowSpan},${cell.columnSpan}) = "${txt}"`); - for (let r = cell.rowIndex; r < cell.rowIndex + cell.rowSpan; r++) { for (let c = cell.columnIndex; c < cell.columnIndex + cell.columnSpan; c++) { - const oldValue = grid[r][c]; - const newValue = oldValue ? `${oldValue} ${txt}` : txt; - grid[r][c] = newValue; - - if (oldValue) { - console.log(`๐ Grid[${r}][${c}]: "${oldValue}" โ "${newValue}"`); - } + grid[r][c] = grid[r][c] ? `${grid[r][c]} ${txt}` : txt; } } }); - - // ์ต์ข
๊ทธ๋ฆฌ๋ ์ถ๋ ฅ - console.log(`๐ Final grid:`); - grid.forEach((row, r) => { - console.log(` Row ${r}: [${row.map(cell => `"${cell}"`).join(', ')}]`); - }); - return grid; } @@ -219,52 +131,13 @@ function getCellText (cell: TableCell): string { } function findHeaderRow (grid: string[][]): number { - console.log(`๐ Finding header row in grid with ${grid.length} rows`); - - for (let i = 0; i < Math.min(5, grid.length); i++) { - const rowText = grid[i].join(' ').toLowerCase(); - console.log(`๐ Row ${i}: "${rowText}"`); - - // ๊ธฐ์กด ์๊ฒฉํ ์กฐ๊ฑด - if (/\bno\b|๋ฒํธ/.test(rowText) && /identification|์๋ณ|ident/.test(rowText)) { - console.log(`โ
Row ${i}: Strict match`); - return i; - } - - // ์ํ๋ ์กฐ๊ฑด๋ค - const relaxedMatches = [ - // 1. NO ์ปฌ๋ผ + ๋ค๋ฅธ ๊ด๋ จ ํค์๋ - (/\bno\b|๋ฒํธ/.test(rowText) && /joint|tag|type|weld|date/.test(rowText)), - - // 2. ID/์๋ณ + ๋ค๋ฅธ ๊ด๋ จ ํค์๋ - (/identification|์๋ณ|ident|id/.test(rowText) && /joint|tag|no|type/.test(rowText)), - - // 3. ์ฉ์ ๊ด๋ จ ํค์๋๊ฐ ์ฌ๋ฌ ๊ฐ - (rowText.match(/joint|tag|type|weld|date|no|id|์๋ณ|๋ฒํธ|์ฉ์ /g)?.length >= 3), - - // 4. ์ฒซ ๋ฒ์งธ ํ์ด๊ณ ์ฌ๋ฌ ๋จ์ด๊ฐ ์๋ ๊ฒฝ์ฐ - (i === 0 && rowText.split(/\s+/).filter(w => w.length > 1).length >= 3) - ]; - - if (relaxedMatches.some(Boolean)) { - console.log(`โ
Row ${i}: Relaxed match`); - return i; - } - - console.log(`โ Row ${i}: No match`); - } - - // ์ตํ์ ์๋จ: ์ฒซ ๋ฒ์งธ ๋น์ด์์ง ์์ ํ for (let i = 0; i < Math.min(3, grid.length); i++) { - if (grid[i].some(cell => cell.trim().length > 0)) { - console.log(`โ ๏ธ Using row ${i} as fallback header`); - return i; - } + const t = grid[i].join(' ').toLowerCase(); + if (/\bno\b|๋ฒํธ/.test(t) && /identification|์๋ณ|ident/.test(t)) return i; } - - console.log(`โ No header row found`); return -1; } + /* -------------------------------------------------------------------------- */ /* Column Mapping */ /* -------------------------------------------------------------------------- */ @@ -276,153 +149,19 @@ function detectFormat (header: string[]): 'format1' | 'format2' { function mapColumns (header: string[]): ColumnMapping { const mp: ColumnMapping = { no: -1, identification: -1, tagNo: -1, jointNo: -1, jointType: -1, weldingDate: -1 }; - - console.log(`๐๏ธ Smart mapping columns from header: [${header.map(h => `"${h}"`).join(', ')}]`); - // === STEP 1: ๊ธฐ์กด ๊ฐ๋ณ ์ปฌ๋ผ ๋งคํ === header.forEach((h, i) => { - const t = h.toLowerCase().trim(); - console.log(`๐ Column ${i}: "${h}" โ "${t}"`); - - if (mp.no === -1 && (/^no\.?$/i.test(t) || /^๋ฒํธ$/i.test(t) || /^์๋ฒ$/i.test(t))) { - mp.no = i; - console.log(`โ
NO column (individual) mapped to index ${i}`); - } - - if (mp.identification === -1 && (/identification.*no/i.test(t) || /์๋ณ.*๋ฒํธ/i.test(t))) { - mp.identification = i; - console.log(`โ
Identification column (individual) mapped to index ${i}`); - } - - if (mp.tagNo === -1 && (/tag.*no/i.test(t) || /ํ๊ทธ.*๋ฒํธ/i.test(t))) { - mp.tagNo = i; - console.log(`โ
Tag No column (individual) mapped to index ${i}`); - } - - if (mp.jointNo === -1 && (/joint.*no/i.test(t) || /์กฐ์ธํธ.*๋ฒํธ/i.test(t) || /oint.*no/i.test(t))) { - mp.jointNo = i; - console.log(`โ
Joint No column (individual) mapped to index ${i}`); - } - - if (mp.jointType === -1 && (/joint.*type/i.test(t) || /^type$/i.test(t) || /ํํ/i.test(t))) { - mp.jointType = i; - console.log(`โ
Joint Type column (individual) mapped to index ${i}`); - } - - if (mp.weldingDate === -1 && (/welding.*date/i.test(t) || /weld.*date/i.test(t) || /^date$/i.test(t) || /๋ ์ง/i.test(t))) { - mp.weldingDate = i; - console.log(`โ
Welding Date column (individual) mapped to index ${i}`); - } - }); - - // === STEP 2: ์ค์ฉ์ ์ถ๋ก === - console.log(`๐ค Starting practical column inference...`); - - // NO ์ปฌ๋ผ์ด ๋งคํ๋์ง ์์๋ค๋ฉด, ์ฒซ ๋ฒ์งธ ์ปฌ๋ผ์ NO๋ก ์ถ์ - if (mp.no === -1) { - mp.no = 0; - console.log(`๐ฎ NO column inferred as index 0 (first column)`); - } - - // Identification ์ปฌ๋ผ ์ฐพ๊ธฐ - "identification" ํค์๋๊ฐ ํฌํจ๋ ์ปฌ๋ผ ์ค์์ - if (mp.identification === -1) { - for (let i = 0; i < header.length; i++) { - const text = header[i].toLowerCase(); - if (text.includes('identification') || text.includes('์๋ณ')) { - mp.identification = i; - console.log(`๐ Identification column found at index ${i}`); - break; - } - } - } - - // Tag No ์ปฌ๋ผ ์ฐพ๊ธฐ - "tag" ํค์๋๊ฐ ํฌํจ๋ ์ปฌ๋ผ ์ค์์ - if (mp.tagNo === -1) { - for (let i = 0; i < header.length; i++) { - const text = header[i].toLowerCase(); - if (text.includes('tag') && !text.includes('no')) { - mp.tagNo = i; - console.log(`๐ท๏ธ Tag column found at index ${i}`); - break; - } - } - } - - // Joint No ์ปฌ๋ผ ์ฐพ๊ธฐ - if (mp.jointNo === -1) { - for (let i = 0; i < header.length; i++) { - const text = header[i].toLowerCase(); - if (text.includes('joint') || text.includes('oint')) { - mp.jointNo = i; - console.log(`๐ Joint column found at index ${i}`); - break; - } - } - } - - // === STEP 3: ํจํด ๊ธฐ๋ฐ ์ถ๋ก (๋ง์ง๋ง ์๋จ) === - console.log(`๐ฏ Pattern-based fallback mapping...`); - - // ์ ์ฒด ํค๋์์ ์ค์ ์๋ณ๋ฒํธ ํจํด์ด ์๋ ์ปฌ๋ผ ์ฐพ๊ธฐ - if (mp.identification === -1) { - for (let i = 0; i < header.length; i++) { - const text = header[i]; - // ํ์ดํ์ด ํฌํจ๋ ๊ธด ๋ฌธ์์ด์ด ์๋ ์ปฌ๋ผ - if (text.includes('-') && text.length > 15) { - mp.identification = i; - console.log(`๐ Identification inferred at index ${i} (contains ID pattern)`); - break; - } - } - } - - // ์ซ์ ํจํด์ด ์๋ ์ปฌ๋ผ์ Tag No๋ก ์ถ์ - if (mp.tagNo === -1) { - for (let i = 1; i < header.length; i++) { // ์ฒซ ๋ฒ์งธ ์ปฌ๋ผ ์ ์ธ - const text = header[i]; - // 7-8์๋ฆฌ ์ซ์๊ฐ ์๋ ์ปฌ๋ผ - if (/\d{7,8}/.test(text)) { - mp.tagNo = i; - console.log(`๐ท๏ธ Tag No inferred at index ${i} (contains number pattern)`); - break; - } - } - } - - // === STEP 4: ๊ธฐ๋ณธ๊ฐ ์ค์ === - console.log(`๐ง Setting default values for unmapped columns...`); - - // ์ฌ์ ํ ๋งคํ๋์ง ์์ ์ค์ํ ์ปฌ๋ผ๋ค์ ๋ํด ์์ ๊ธฐ๋ฐ ์ถ์ - const essentialColumns = [ - { key: 'identification', currentValue: mp.identification, defaultIndex: 1 }, - { key: 'tagNo', currentValue: mp.tagNo, defaultIndex: 2 }, - { key: 'jointNo', currentValue: mp.jointNo, defaultIndex: 3 }, - { key: 'jointType', currentValue: mp.jointType, defaultIndex: 4 }, - { key: 'weldingDate', currentValue: mp.weldingDate, defaultIndex: Math.min(5, header.length - 1) } - ]; - - essentialColumns.forEach(col => { - if ((col.currentValue as number) === -1 && col.defaultIndex < header.length) { - (mp as any)[col.key] = col.defaultIndex; - console.log(`๐ง ${col.key} set to default index ${col.defaultIndex}`); - } + const t = h.toLowerCase(); + if (/^no\.?$/.test(t) && !/ident|tag|joint/.test(t)) mp.no = i; + else if (/identification|ident/.test(t)) mp.identification = i; + else if (/tag.*no/.test(t)) mp.tagNo = i; + else if (/joint.*no/.test(t)) mp.jointNo = i; + else if (/joint.*type/.test(t) || (/^type$/.test(t) && mp.jointType === -1)) mp.jointType = i; + else if (/welding|date/.test(t)) mp.weldingDate = i; }); - - console.log(`๐ฏ Final optimized column mapping:`, mp); - - // === STEP 5: ๋งคํ ํ์ง ๊ฒ์ฆ === - const mappedCount = Object.values(mp).filter(v => v !== -1).length; - const totalColumns = Object.keys(mp).length; - const mappingQuality = mappedCount / totalColumns; - - console.log(`๐ Mapping quality: ${mappedCount}/${totalColumns} (${(mappingQuality * 100).toFixed(1)}%)`); - - if (mappingQuality < 0.5) { - console.warn(`โ ๏ธ Low mapping quality detected. Consider manual adjustment.`); - } - return mp; } + /* -------------------------------------------------------------------------- */ /* Row Extraction */ /* -------------------------------------------------------------------------- */ @@ -434,351 +173,71 @@ function buildRow ( tblIdx: number, rowIdx: number ): ExtractedRow | null { - console.log(`๐จ Building row from: [${row.map(r => `"${r}"`).join(', ')}]`); - console.log(`๐ Using mapping:`, mp); - console.log(`๐ Format: ${format}`); - const out: ExtractedRow = { - no: '', + no: mp.no >= 0 ? clean(row[mp.no]) : '', identificationNo: '', tagNo: '', jointNo: '', - jointType: '', + jointType: mp.jointType >= 0 ? clean(row[mp.jointType]) : '', weldingDate: '', confidence: 0, sourceTable: tblIdx, sourceRow: rowIdx, }; - // === STEP 1: ๋งคํ๋ ์ปฌ๋ผ์์ ๊ธฐ๋ณธ ์ถ์ถ === - - // NO ์ปฌ๋ผ ์ถ์ถ - if (mp.no >= 0 && mp.no < row.length) { - const rawNo = clean(row[mp.no]); - // NO ํ๋์์ ์ฒซ ๋ฒ์งธ ์ซ์ ํจํด ์ถ์ถ - const noMatch = rawNo.match(/\b(\d{2,4})\b/); - out.no = noMatch ? noMatch[1] : rawNo; - console.log(`๐ NO from column ${mp.no}: "${out.no}" (raw: "${rawNo}")`); - } - - // Joint Type, Welding Date๋ ๊ธฐ์กด๋๋ก - if (mp.jointType >= 0 && mp.jointType < row.length) { - out.jointType = clean(row[mp.jointType]); - console.log(`๐ Joint Type from column ${mp.jointType}: "${out.jointType}"`); + if (mp.weldingDate >= 0) out.weldingDate = clean(row[mp.weldingDate]); + else { + const idx = row.findIndex(col => /\d{4}[.\-/]\d{1,2}[.\-/]\d{1,2}/.test(col)); + if (idx >= 0) out.weldingDate = clean(row[idx]); } - if (mp.weldingDate >= 0 && mp.weldingDate < row.length) { - out.weldingDate = clean(row[mp.weldingDate]); - console.log(`๐
Welding Date from column ${mp.weldingDate}: "${out.weldingDate}"`); - } - - // === STEP 2: Format๋ณ ๋ฐ์ดํฐ ์ถ์ถ === - if (format === 'format2') { - console.log(`๐ Processing Format 2 (separate columns)`); - - if (mp.identification >= 0 && mp.identification < row.length) { - out.identificationNo = clean(row[mp.identification]); - console.log(`๐ Identification from column ${mp.identification}: "${out.identificationNo}"`); - } - - if (mp.jointNo >= 0 && mp.jointNo < row.length) { - out.jointNo = clean(row[mp.jointNo]); - console.log(`๐ Joint No from column ${mp.jointNo}: "${out.jointNo}"`); - } - - if (mp.tagNo >= 0 && mp.tagNo < row.length) { - out.tagNo = clean(row[mp.tagNo]); - console.log(`๐ท๏ธ Tag No from column ${mp.tagNo}: "${out.tagNo}"`); - } + if (mp.identification >= 0) out.identificationNo = clean(row[mp.identification]); + if (mp.jointNo >= 0) out.jointNo = clean(row[mp.jointNo]); + if (mp.tagNo >= 0) out.tagNo = clean(row[mp.tagNo]); } else { - console.log(`๐ Processing Format 1 (combined identification column)`); - - let combinedText = ''; - - // ๋งคํ๋ identification ์ปฌ๋ผ์์ ํ
์คํธ ๊ฐ์ ธ์ค๊ธฐ - if (mp.identification >= 0 && mp.identification < row.length) { - combinedText = row[mp.identification]; - console.log(`๐ Combined text from column ${mp.identification}: "${combinedText}"`); - } - - const parsed = parseIdentificationData(combinedText); + const combined = mp.identification >= 0 ? row[mp.identification] : ''; + const parsed = parseIdentificationData(combined); out.identificationNo = parsed.identificationNo; - out.jointNo = parsed.jointNo; - out.tagNo = parsed.tagNo; - - console.log(`๐ Parsed from identification column:`, parsed); + out.jointNo = parsed.jointNo; + out.tagNo = parsed.tagNo; } - // === STEP 3: ์ ๊ทน์ ํจํด ๋งค์นญ์ผ๋ก ๋๋ฝ๋ ํ๋ ์ฑ์ฐ๊ธฐ === - console.log(`๐ Aggressive pattern matching for missing fields...`); - - const allText = row.join(' '); - console.log(`๐ Full row text: "${allText}"`); - - // NO ํ๋๊ฐ ๋น์ด์๋ค๋ฉด ์ฒซ ๋ฒ์งธ ์ปฌ๋ผ์์ ์ซ์ ํจํด ์ฐพ๊ธฐ - if (!out.no && row.length > 0) { - const firstCol = clean(row[0]); - const noPatterns = [ - /\b(\d{3})\b/g, // 3์๋ฆฌ ์ซ์ - /\b(\d{2,4})\b/g, // 2-4์๋ฆฌ ์ซ์ - /^(\d+)/ // ๋งจ ์ ์ซ์ - ]; - - for (const pattern of noPatterns) { - const matches = firstCol.match(pattern); - if (matches && matches.length > 0) { - out.no = matches[0].replace(/\D/g, ''); // ์ซ์๋ง ์ถ์ถ - console.log(`๐ NO found via pattern in first column: "${out.no}"`); - break; - } - } - } - - // Identification No ํจํด ์ฐพ๊ธฐ (ํ์ดํ์ด ํฌํจ๋ ๊ธด ๋ฌธ์์ด) - if (!out.identificationNo) { - const idPatterns = [ - /[A-Za-z0-9]+-[A-Za-z0-9]+-[A-Za-z0-9\-]+/g, - /-\d+[A-Za-z0-9]+-[A-Za-z0-9]+-[A-Za-z0-9]+/g, - /\b[A-Z]\d+[A-Z]-\d+-\d+-[A-Z]+-\d+-[A-Z0-9]+-[A-Z]-[A-Z0-9]+\b/g - ]; - - for (const pattern of idPatterns) { - const matches = allText.match(pattern); - if (matches && matches.length > 0) { - out.identificationNo = matches[0]; - console.log(`๐ Identification found via pattern: "${out.identificationNo}"`); - break; - } - } - } - - // Tag No ํจํด ์ฐพ๊ธฐ (7-8์๋ฆฌ ์ซ์) - if (!out.tagNo) { - const tagMatches = allText.match(/\b\d{7,8}\b/g); - if (tagMatches && tagMatches.length > 0) { - out.tagNo = tagMatches[0]; - console.log(`๐ท๏ธ Tag found via pattern: "${out.tagNo}"`); - } - } - - // Joint No ํจํด ์ฐพ๊ธฐ (์งง์ ์์ซ์ ์กฐํฉ) - if (!out.jointNo) { - const jointPatterns = [ - /\b[A-Z]{2,4}\d*\b/g, // ๋๋ฌธ์+์ซ์ ์กฐํฉ - /\b[A-Za-z0-9]{2,6}\b/g // ์ผ๋ฐ์ ์ธ ์งง์ ์กฐํฉ - ]; - - for (const pattern of jointPatterns) { - const matches = allText.match(pattern); - if (matches) { - const candidates = matches.filter(m => - m !== out.no && - m !== out.tagNo && - m !== out.identificationNo && - m.length >= 2 && m.length <= 6 && - !/^(no|tag|joint|type|date|welding|project|samsung|class)$/i.test(m) - ); - - if (candidates.length > 0) { - out.jointNo = candidates[0]; - console.log(`๐ Joint found via pattern: "${out.jointNo}"`); - break; - } - } - } - } - - // Welding Date ํจํด ์ฐพ๊ธฐ - if (!out.weldingDate) { - const datePatterns = [ - /\d{4}[.\-/]\d{1,2}[.\-/]\d{1,2}/g, - /\d{4}\.\d{2}\.\d{2}/g - ]; - - for (const pattern of datePatterns) { - const matches = allText.match(pattern); - if (matches && matches.length > 0) { - out.weldingDate = matches[0]; - console.log(`๐
Date found via pattern: "${out.weldingDate}"`); - break; - } - } - } - - // === STEP 4: ํ์ง ๊ฒ์ฆ ๋ฐ ํ์ฒ๋ฆฌ === - - // ์ถ์ถ๋ ๊ฐ๋ค ์ ๋ฆฌ - Object.keys(out).forEach(key => { - const value = (out as any)[key]; - if (typeof value === 'string' && value) { - (out as any)[key] = value.replace(/^[^\w]+|[^\w]+$/g, '').trim(); - } - }); - out.confidence = scoreRow(out); - - console.log(`๐ Final extracted row:`, out); - console.log(`๐ฏ Row confidence: ${out.confidence}`); - - // ์ต์ํ์ ๋ฐ์ดํฐ๊ฐ ์๋์ง ๊ฒ์ฆ - const hasAnyData = !!(out.no || out.identificationNo || out.tagNo || out.jointNo); - - if (!hasAnyData) { - console.log(`โ ๏ธ No meaningful data extracted from row`); - return null; - } - return out; } + /* -------------------------------------------------------------------------- */ /* Formatโ1 ์
ํ์ฑ */ /* -------------------------------------------------------------------------- */ function parseIdentificationData (txt: string): { identificationNo: string; jointNo: string; tagNo: string } { - console.log(`๐ Parsing identification data from: "${txt}"`); - const cleaned = clean(txt); - if (!cleaned) { - console.log(`โ Empty input text`); - return { identificationNo: '', jointNo: '', tagNo: '' }; - } + if (!cleaned) return { identificationNo: '', jointNo: '', tagNo: '' }; - console.log(`๐งน Cleaned text: "${cleaned}"`); - - const result = { identificationNo: '', jointNo: '', tagNo: '' }; - - // 1. Identification No ์ถ์ถ (ํ์ดํ์ด 2๊ฐ ์ด์ ํฌํจ๋ ํจํด) - const idPatterns = [ - /[A-Za-z0-9]+-[A-Za-z0-9]+-[A-Za-z0-9\-]+/g, // ๊ธฐ๋ณธ ํจํด - /-\d+[A-Za-z0-9]+-[A-Za-z0-9]+-[A-Za-z0-9]+/g, // ์์ ํ์ดํ์ด ์๋ ๊ฒฝ์ฐ - /\b[A-Za-z0-9]{2,}-[A-Za-z0-9]{2,}-[A-Za-z0-9]{2,}\b/g // ๋ ์๊ฒฉํ ํจํด - ]; - - for (const pattern of idPatterns) { - const matches = cleaned.match(pattern); - if (matches && matches.length > 0) { - // ๊ฐ์ฅ ๊ธด ๋งค์น๋ฅผ ์ ํ - result.identificationNo = matches.reduce((a, b) => a.length >= b.length ? a : b); - console.log(`๐ Found identification: "${result.identificationNo}"`); - break; - } - } - - // 2. Tag No ์ถ์ถ (7-8์๋ฆฌ ์ซ์) - const tagPatterns = [ - /\btag[:\s]*(\d{7,8})\b/i, // "tag: 1234567" ํํ - /\b(\d{7,8})\b/g // ๋จ์ 7-8์๋ฆฌ ์ซ์ - ]; - - for (const pattern of tagPatterns) { - const matches = cleaned.match(pattern); - if (matches) { - if (pattern.source.includes('tag')) { - result.tagNo = matches[1] || matches[0]; - } else { - // ๋ชจ๋ 7-8์๋ฆฌ ์ซ์๋ฅผ ์ฐพ์์ ๊ฐ์ฅ ์ ์ ํ ๊ฒ ์ ํ - const candidates = matches.filter(m => m && m.length >= 7 && m.length <= 8); - if (candidates.length > 0) { - result.tagNo = candidates[0]; - } - } - if (result.tagNo) { - console.log(`๐ท๏ธ Found tag: "${result.tagNo}"`); - break; - } - } - } - - // 3. Joint No ์ถ์ถ (๋๋จธ์ง ํ ํฐ ์ค์์) const tokens = cleaned.split(/\s+/).map(clean).filter(Boolean); - console.log(`๐ All tokens: [${tokens.join(', ')}]`); - - // ์ด๋ฏธ ์ฌ์ฉ๋ ํ ํฐ๋ค ์ ์ธ - const usedTokens = new Set([result.identificationNo, result.tagNo]); - const remainingTokens = tokens.filter(token => - !usedTokens.has(token) && - !result.identificationNo.includes(token) && - !result.tagNo.includes(token) && - token.length > 1 && - !/^(tag|joint|no|identification|์๋ณ|๋ฒํธ)$/i.test(token) - ); - - console.log(`๐ Remaining tokens for joint: [${remainingTokens.join(', ')}]`); - - if (remainingTokens.length > 0) { - // ๊ฐ์ฅ ์งง๊ณ ์ํ๋ฒณ+์ซ์ ์กฐํฉ์ธ ํ ํฐ์ Joint No๋ก ์ ํ - const jointCandidates = remainingTokens - .filter(token => /^[A-Za-z0-9]+$/.test(token) && token.length >= 2 && token.length <= 8) - .sort((a, b) => a.length - b.length); - - if (jointCandidates.length > 0) { - result.jointNo = jointCandidates[0]; - console.log(`๐ Found joint: "${result.jointNo}"`); - } else if (remainingTokens.length > 0) { - // ํ๋ณด๊ฐ ์์ผ๋ฉด ๊ฐ์ฅ ์งง์ ํ ํฐ ์ฌ์ฉ - result.jointNo = remainingTokens.reduce((a, b) => a.length <= b.length ? a : b); - console.log(`๐ Found joint (fallback): "${result.jointNo}"`); - } - } - - // 4. ๊ฒฐ๊ณผ ๊ฒ์ฆ ๋ฐ ์ ๋ฆฌ - Object.keys(result).forEach(key => { - const value = (result as any)[key]; - if (value && typeof value === 'string') { - (result as any)[key] = value.replace(/^[^\w]+|[^\w]+$/g, ''); // ์๋ค ํน์๋ฌธ์ ์ ๊ฑฐ - } - }); - - console.log(`๐ Final parsed result:`, result); - return result; + + // Identification ํ๋ณด: ํ์ดํ์ด 2๊ฐ ์ด์ ํฌํจ๋ ํ ํฐ ๊ฐ์ด๋ฐ ๊ฐ์ฅ ๊ธด ๊ฒ + const idCand = tokens.filter(t => t.split('-').length >= 3).sort((a, b) => b.length - a.length); + const identificationNo = idCand[0] || ''; + + const residual = tokens.filter(t => t !== identificationNo); + if (!residual.length) return { identificationNo, jointNo: '', tagNo: '' }; + + residual.sort((a, b) => a.length - b.length); + const jointNo = residual[0] || ''; + const tagNo = residual[residual.length - 1] || ''; + + return { identificationNo, jointNo, tagNo }; } + /* -------------------------------------------------------------------------- */ /* Helpers */ /* -------------------------------------------------------------------------- */ const clean = (s: string = '') => s.replace(/[\r\n\t]+/g, ' ').replace(/\s+/g, ' ').trim(); const isBlankRow = (row: string[]) => row.every(c => !clean(c)); -function isValidRow (r: ExtractedRow): boolean { - console.log(`โ
Validating row: no="${r.no}", id="${r.identificationNo}", tag="${r.tagNo}", joint="${r.jointNo}"`); - - // Level 1: ๊ธฐ์กด ์๊ฒฉํ ์กฐ๊ฑด - if (r.no && r.no.trim() || r.identificationNo && r.identificationNo.trim()) { - console.log(`โ
Level 1 validation passed (has no or identification)`); - return true; - } - - // Level 2: ์ํ๋ ์กฐ๊ฑด - ์ฃผ์ ํ๋ ์ค 2๊ฐ ์ด์ - const mainFields = [ - r.no?.trim(), - r.identificationNo?.trim(), - r.tagNo?.trim(), - r.jointNo?.trim() - ].filter(Boolean); - - if (mainFields.length >= 2) { - console.log(`โ
Level 2 validation passed (${mainFields.length} main fields present)`); - return true; - } - - // Level 3: ๋ ๊ด๋ํ ์กฐ๊ฑด - ์ด๋ค ํ๋๋ ํ๋๋ผ๋ ์๋ฏธ์๋ ๊ฐ - const allFields = [ - r.no?.trim(), - r.identificationNo?.trim(), - r.tagNo?.trim(), - r.jointNo?.trim(), - r.jointType?.trim(), - r.weldingDate?.trim() - ].filter(field => field && field.length > 1); // 1๊ธ์ ์ด์ - - if (allFields.length >= 1) { - console.log(`โ
Level 3 validation passed (${allFields.length} fields with meaningful content)`); - return true; - } - - console.log(`โ Validation failed - no meaningful content found`); - return false; -} +const isValidRow = (r: ExtractedRow) => !!(r.no || r.identificationNo); function scoreRow (r: ExtractedRow): number { const w: Record<keyof ExtractedRow, number> = { @@ -808,4 +267,4 @@ export function analyzeOCRQuality (ocrResult: any) { }); return { confidence: cnt ? conf / cnt : 0, tablesFound: tbl, textQuality: cnt ? kw / cnt : 0, keywordCount: kw }; -} +}
\ No newline at end of file |
