summaryrefslogtreecommitdiff
path: root/app/api/ocr/utils/tableExtraction.ts
diff options
context:
space:
mode:
Diffstat (limited to 'app/api/ocr/utils/tableExtraction.ts')
-rw-r--r--app/api/ocr/utils/tableExtraction.ts648
1 files changed, 596 insertions, 52 deletions
diff --git a/app/api/ocr/utils/tableExtraction.ts b/app/api/ocr/utils/tableExtraction.ts
index 720e5a5f..0a727f84 100644
--- a/app/api/ocr/utils/tableExtraction.ts
+++ b/app/api/ocr/utils/tableExtraction.ts
@@ -69,37 +69,107 @@ export async function extractTablesFromOCR (ocrResult: any): Promise<ExtractedRo
function isRelevantTable (table: OCRTable): boolean {
const headers = table.cells.filter(c => c.rowIndex < 3).map(getCellText).join(' ').toLowerCase();
- return /\bno\b|๋ฒˆํ˜ธ/.test(headers) && /identification|์‹๋ณ„|ident|id/.test(headers);
+ console.log(`๐Ÿ” Checking table relevance. Headers: "${headers}"`);
+
+ // Existing conditions
+ const hasNoColumn = /\bno\b|๋ฒˆํ˜ธ/.test(headers);
+ const hasIdentification = /identification|์‹๋ณ„|ident|id/.test(headers);
+
+ console.log(`๐Ÿ“ Has NO column: ${hasNoColumn}`);
+ console.log(`๐Ÿ“ Has Identification: ${hasIdentification}`);
+
+ // Base condition
+ if (hasNoColumn && hasIdentification) {
+ console.log(`โœ… Table passes strict criteria`);
+ return true;
+ }
+
+ // Relaxed conditions
+ const relaxedConditions = [
+ // Condition 1: the table has several columns and a number or identifier pattern is visible
+ table.cells.length > 10 && /\d+/.test(headers),
+
+ // Condition 2: related keywords such as joint, tag, weld are present
+ /joint|tag|weld|type|date/.test(headers),
+
+ // Condition 3: an identification-number pattern is visible (a string containing a hyphen)
+ headers.includes('-') && headers.length > 20,
+
+ // Condition 4: related Korean keywords
+ /์šฉ์ ‘|์กฐ์ธํŠธ|ํƒœ๊ทธ/.test(headers)
+ ];
+
+ const passedConditions = relaxedConditions.filter(Boolean).length;
+ console.log(`๐Ÿ“Š Relaxed conditions passed: ${passedConditions}/${relaxedConditions.length}`);
+
+ if (passedConditions >= 1) {
+ console.log(`โœ… Table passes relaxed criteria`);
+ return true;
+ }
+
+ console.log(`โŒ Table does not meet any criteria`);
+ return false;
}
-
/* -------------------------------------------------------------------------- */
/* Table interpretation */
/* -------------------------------------------------------------------------- */
function extractTableData (table: OCRTable, imgIdx: number, tblIdx: number): ExtractedRow[] {
+ console.log(`๐Ÿ”ง Starting extractTableData for table ${imgIdx}-${tblIdx}`);
+
const grid = buildGrid(table);
+ console.log(`๐Ÿ“Š Grid size: ${grid.length} rows x ${grid[0]?.length || 0} columns`);
+
const headerRowIdx = findHeaderRow(grid);
- if (headerRowIdx === -1) return [];
+ console.log(`๐Ÿ“ Header row index: ${headerRowIdx}`);
- const format = detectFormat(grid[headerRowIdx]);
- const mapping = mapColumns(grid[headerRowIdx]);
+ if (headerRowIdx === -1) {
+ console.log(`โŒ No header row found`);
+ return [];
+ }
+
+ const format = detectFormat(grid[headerRowIdx]);
+ const mapping = mapColumns(grid[headerRowIdx]);
+
+ console.log(`๐Ÿ“‹ Detected format: ${format}`);
+ console.log(`๐Ÿ—‚๏ธ Column mapping:`, mapping);
const seen = new Set<string>();
const data: ExtractedRow[] = [];
for (let r = headerRowIdx + 1; r < grid.length; r++) {
const row = grid[r];
- if (isBlankRow(row)) continue;
+
+ if (isBlankRow(row)) {
+ console.log(`โญ๏ธ Row ${r}: blank, skipping`);
+ continue;
+ }
+
+ console.log(`๐Ÿ” Processing row ${r}: [${row.join(' | ')}]`);
const parsed = buildRow(row, format, mapping, tblIdx, r);
- if (!parsed || !isValidRow(parsed)) continue;
+ if (!parsed) {
+ console.log(`โŒ Row ${r}: failed to parse`);
+ continue;
+ }
+
+ if (!isValidRow(parsed)) {
+ console.log(`โŒ Row ${r}: invalid (no: "${parsed.no}", id: "${parsed.identificationNo}")`);
+ continue;
+ }
const key = `${parsed.no}-${parsed.identificationNo}`;
- if (seen.has(key)) continue;
+ if (seen.has(key)) {
+ console.log(`โš ๏ธ Row ${r}: duplicate key "${key}", skipping`);
+ continue;
+ }
+
seen.add(key);
-
data.push(parsed);
+ console.log(`โœ… Row ${r}: added (${JSON.stringify(parsed)})`);
}
+
+ console.log(`๐ŸŽฏ Table ${imgIdx}-${tblIdx}: extracted ${data.length} valid rows`);
return data;
}
@@ -108,18 +178,39 @@ function extractTableData (table: OCRTable, imgIdx: number, tblIdx: number): Ext
/* -------------------------------------------------------------------------- */
function buildGrid (table: OCRTable): string[][] {
+ console.log(`๐Ÿ”ง Building grid from ${table.cells.length} cells`);
+
const maxR = Math.max(...table.cells.map(c => c.rowIndex + c.rowSpan - 1));
const maxC = Math.max(...table.cells.map(c => c.columnIndex + c.columnSpan - 1));
+
+ console.log(`๐Ÿ“Š Grid dimensions: ${maxR + 1} rows x ${maxC + 1} columns`);
+
const grid = Array.from({ length: maxR + 1 }, () => Array(maxC + 1).fill(''));
- table.cells.forEach(cell => {
+ // Print detailed information for each cell
+ table.cells.forEach((cell, idx) => {
const txt = getCellText(cell);
+ console.log(`๐Ÿ“ฑ Cell ${idx}: (${cell.rowIndex},${cell.columnIndex}) span(${cell.rowSpan},${cell.columnSpan}) = "${txt}"`);
+
for (let r = cell.rowIndex; r < cell.rowIndex + cell.rowSpan; r++) {
for (let c = cell.columnIndex; c < cell.columnIndex + cell.columnSpan; c++) {
- grid[r][c] = grid[r][c] ? `${grid[r][c]} ${txt}` : txt;
+ const oldValue = grid[r][c];
+ const newValue = oldValue ? `${oldValue} ${txt}` : txt;
+ grid[r][c] = newValue;
+
+ if (oldValue) {
+ console.log(`๐Ÿ”„ Grid[${r}][${c}]: "${oldValue}" โ†’ "${newValue}"`);
+ }
}
}
});
+
+ // Print the final grid
+ console.log(`๐Ÿ“‹ Final grid:`);
+ grid.forEach((row, r) => {
+ console.log(` Row ${r}: [${row.map(cell => `"${cell}"`).join(', ')}]`);
+ });
+
return grid;
}
@@ -128,13 +219,52 @@ function getCellText (cell: TableCell): string {
}
function findHeaderRow (grid: string[][]): number {
+ console.log(`๐Ÿ” Finding header row in grid with ${grid.length} rows`);
+
+ for (let i = 0; i < Math.min(5, grid.length); i++) {
+ const rowText = grid[i].join(' ').toLowerCase();
+ console.log(`๐Ÿ“ Row ${i}: "${rowText}"`);
+
+ // Existing strict condition
+ if (/\bno\b|๋ฒˆํ˜ธ/.test(rowText) && /identification|์‹๋ณ„|ident/.test(rowText)) {
+ console.log(`โœ… Row ${i}: Strict match`);
+ return i;
+ }
+
+ // Relaxed conditions
+ const relaxedMatches = [
+ // 1. NO column + another related keyword
+ (/\bno\b|๋ฒˆํ˜ธ/.test(rowText) && /joint|tag|type|weld|date/.test(rowText)),
+
+ // 2. ID/identification + another related keyword
+ (/identification|์‹๋ณ„|ident|id/.test(rowText) && /joint|tag|no|type/.test(rowText)),
+
+ // 3. Several welding-related keywords
+ (rowText.match(/joint|tag|type|weld|date|no|id|์‹๋ณ„|๋ฒˆํ˜ธ|์šฉ์ ‘/g)?.length >= 3),
+
+ // 4. It is the first row and contains several words
+ (i === 0 && rowText.split(/\s+/).filter(w => w.length > 1).length >= 3)
+ ];
+
+ if (relaxedMatches.some(Boolean)) {
+ console.log(`โœ… Row ${i}: Relaxed match`);
+ return i;
+ }
+
+ console.log(`โŒ Row ${i}: No match`);
+ }
+
+ // Last resort: the first non-empty row
for (let i = 0; i < Math.min(3, grid.length); i++) {
- const t = grid[i].join(' ').toLowerCase();
- if (/\bno\b|๋ฒˆํ˜ธ/.test(t) && /identification|์‹๋ณ„|ident/.test(t)) return i;
+ if (grid[i].some(cell => cell.trim().length > 0)) {
+ console.log(`โš ๏ธ Using row ${i} as fallback header`);
+ return i;
+ }
}
+
+ console.log(`โŒ No header row found`);
return -1;
}
-
/* -------------------------------------------------------------------------- */
/* Column Mapping */
/* -------------------------------------------------------------------------- */
@@ -146,19 +276,153 @@ function detectFormat (header: string[]): 'format1' | 'format2' {
function mapColumns (header: string[]): ColumnMapping {
const mp: ColumnMapping = { no: -1, identification: -1, tagNo: -1, jointNo: -1, jointType: -1, weldingDate: -1 };
+
+ console.log(`๐Ÿ—‚๏ธ Smart mapping columns from header: [${header.map(h => `"${h}"`).join(', ')}]`);
+ // === STEP 1: Existing per-column mapping ===
header.forEach((h, i) => {
- const t = h.toLowerCase();
- if (/^no\.?$/.test(t) && !/ident|tag|joint/.test(t)) mp.no = i;
- else if (/identification|ident/.test(t)) mp.identification = i;
- else if (/tag.*no/.test(t)) mp.tagNo = i;
- else if (/joint.*no/.test(t)) mp.jointNo = i;
- else if (/joint.*type/.test(t) || (/^type$/.test(t) && mp.jointType === -1)) mp.jointType = i;
- else if (/welding|date/.test(t)) mp.weldingDate = i;
+ const t = h.toLowerCase().trim();
+ console.log(`๐Ÿ“‹ Column ${i}: "${h}" โ†’ "${t}"`);
+
+ if (mp.no === -1 && (/^no\.?$/i.test(t) || /^๋ฒˆํ˜ธ$/i.test(t) || /^์ˆœ๋ฒˆ$/i.test(t))) {
+ mp.no = i;
+ console.log(`โœ… NO column (individual) mapped to index ${i}`);
+ }
+
+ if (mp.identification === -1 && (/identification.*no/i.test(t) || /์‹๋ณ„.*๋ฒˆํ˜ธ/i.test(t))) {
+ mp.identification = i;
+ console.log(`โœ… Identification column (individual) mapped to index ${i}`);
+ }
+
+ if (mp.tagNo === -1 && (/tag.*no/i.test(t) || /ํƒœ๊ทธ.*๋ฒˆํ˜ธ/i.test(t))) {
+ mp.tagNo = i;
+ console.log(`โœ… Tag No column (individual) mapped to index ${i}`);
+ }
+
+ if (mp.jointNo === -1 && (/joint.*no/i.test(t) || /์กฐ์ธํŠธ.*๋ฒˆํ˜ธ/i.test(t) || /oint.*no/i.test(t))) {
+ mp.jointNo = i;
+ console.log(`โœ… Joint No column (individual) mapped to index ${i}`);
+ }
+
+ if (mp.jointType === -1 && (/joint.*type/i.test(t) || /^type$/i.test(t) || /ํ˜•ํƒœ/i.test(t))) {
+ mp.jointType = i;
+ console.log(`โœ… Joint Type column (individual) mapped to index ${i}`);
+ }
+
+ if (mp.weldingDate === -1 && (/welding.*date/i.test(t) || /weld.*date/i.test(t) || /^date$/i.test(t) || /๋‚ ์งœ/i.test(t))) {
+ mp.weldingDate = i;
+ console.log(`โœ… Welding Date column (individual) mapped to index ${i}`);
+ }
+ });
+
+ // === STEP 2: Practical inference ===
+ console.log(`๐Ÿค– Starting practical column inference...`);
+
+ // If the NO column has not been mapped, assume the first column is NO
+ if (mp.no === -1) {
+ mp.no = 0;
+ console.log(`๐Ÿ”ฎ NO column inferred as index 0 (first column)`);
+ }
+
+ // Find the Identification column - among columns containing the "identification" keyword
+ if (mp.identification === -1) {
+ for (let i = 0; i < header.length; i++) {
+ const text = header[i].toLowerCase();
+ if (text.includes('identification') || text.includes('์‹๋ณ„')) {
+ mp.identification = i;
+ console.log(`๐Ÿ†” Identification column found at index ${i}`);
+ break;
+ }
+ }
+ }
+
+ // Find the Tag No column - among columns containing the "tag" keyword
+ if (mp.tagNo === -1) {
+ for (let i = 0; i < header.length; i++) {
+ const text = header[i].toLowerCase();
+ if (text.includes('tag') && !text.includes('no')) {
+ mp.tagNo = i;
+ console.log(`๐Ÿท๏ธ Tag column found at index ${i}`);
+ break;
+ }
+ }
+ }
+
+ // Find the Joint No column
+ if (mp.jointNo === -1) {
+ for (let i = 0; i < header.length; i++) {
+ const text = header[i].toLowerCase();
+ if (text.includes('joint') || text.includes('oint')) {
+ mp.jointNo = i;
+ console.log(`๐Ÿ”— Joint column found at index ${i}`);
+ break;
+ }
+ }
+ }
+
+ // === STEP 3: Pattern-based inference (last resort) ===
+ console.log(`๐ŸŽฏ Pattern-based fallback mapping...`);
+
+ // Across the whole header, find the column that holds an actual identification-number pattern
+ if (mp.identification === -1) {
+ for (let i = 0; i < header.length; i++) {
+ const text = header[i];
+ // A column holding a long string that contains a hyphen
+ if (text.includes('-') && text.length > 15) {
+ mp.identification = i;
+ console.log(`๐Ÿ†” Identification inferred at index ${i} (contains ID pattern)`);
+ break;
+ }
+ }
+ }
+
+ // Assume a column with a number pattern is Tag No
+ if (mp.tagNo === -1) {
+ for (let i = 1; i < header.length; i++) { // ์ฒซ ๋ฒˆ์งธ ์ปฌ๋Ÿผ ์ œ์™ธ
+ const text = header[i];
+ // A column containing a 7-8 digit number
+ if (/\d{7,8}/.test(text)) {
+ mp.tagNo = i;
+ console.log(`๐Ÿท๏ธ Tag No inferred at index ${i} (contains number pattern)`);
+ break;
+ }
+ }
+ }
+
+ // === STEP 4: Set default values ===
+ console.log(`๐Ÿ”ง Setting default values for unmapped columns...`);
+
+ // For important columns that are still unmapped, estimate the index from column order
+ const essentialColumns = [
+ { key: 'identification', currentValue: mp.identification, defaultIndex: 1 },
+ { key: 'tagNo', currentValue: mp.tagNo, defaultIndex: 2 },
+ { key: 'jointNo', currentValue: mp.jointNo, defaultIndex: 3 },
+ { key: 'jointType', currentValue: mp.jointType, defaultIndex: 4 },
+ { key: 'weldingDate', currentValue: mp.weldingDate, defaultIndex: Math.min(5, header.length - 1) }
+ ];
+
+ essentialColumns.forEach(col => {
+ if ((col.currentValue as number) === -1 && col.defaultIndex < header.length) {
+ (mp as any)[col.key] = col.defaultIndex;
+ console.log(`๐Ÿ”ง ${col.key} set to default index ${col.defaultIndex}`);
+ }
});
+
+ console.log(`๐ŸŽฏ Final optimized column mapping:`, mp);
+
+ // === STEP 5: Validate mapping quality ===
+ const mappedCount = Object.values(mp).filter(v => v !== -1).length;
+ const totalColumns = Object.keys(mp).length;
+ const mappingQuality = mappedCount / totalColumns;
+
+ console.log(`๐Ÿ“Š Mapping quality: ${mappedCount}/${totalColumns} (${(mappingQuality * 100).toFixed(1)}%)`);
+
+ if (mappingQuality < 0.5) {
+ console.warn(`โš ๏ธ Low mapping quality detected. Consider manual adjustment.`);
+ }
+
return mp;
}
-
/* -------------------------------------------------------------------------- */
/* Row Extraction */
/* -------------------------------------------------------------------------- */
@@ -170,71 +434,351 @@ function buildRow (
tblIdx: number,
rowIdx: number
): ExtractedRow | null {
+ console.log(`๐Ÿ”จ Building row from: [${row.map(r => `"${r}"`).join(', ')}]`);
+ console.log(`๐Ÿ“‹ Using mapping:`, mp);
+ console.log(`๐Ÿ“„ Format: ${format}`);
+
const out: ExtractedRow = {
- no: mp.no >= 0 ? clean(row[mp.no]) : '',
+ no: '',
identificationNo: '',
tagNo: '',
jointNo: '',
- jointType: mp.jointType >= 0 ? clean(row[mp.jointType]) : '',
+ jointType: '',
weldingDate: '',
confidence: 0,
sourceTable: tblIdx,
sourceRow: rowIdx,
};
- if (mp.weldingDate >= 0) out.weldingDate = clean(row[mp.weldingDate]);
- else {
- const idx = row.findIndex(col => /\d{4}[.\-/]\d{1,2}[.\-/]\d{1,2}/.test(col));
- if (idx >= 0) out.weldingDate = clean(row[idx]);
+ // === STEP 1: Basic extraction from mapped columns ===
+
+ // Extract the NO column
+ if (mp.no >= 0 && mp.no < row.length) {
+ const rawNo = clean(row[mp.no]);
+ // Extract the first number pattern from the NO field
+ const noMatch = rawNo.match(/\b(\d{2,4})\b/);
+ out.no = noMatch ? noMatch[1] : rawNo;
+ console.log(`๐Ÿ“ NO from column ${mp.no}: "${out.no}" (raw: "${rawNo}")`);
+ }
+
+ // Joint Type and Welding Date are handled as before
+ if (mp.jointType >= 0 && mp.jointType < row.length) {
+ out.jointType = clean(row[mp.jointType]);
+ console.log(`๐Ÿ”— Joint Type from column ${mp.jointType}: "${out.jointType}"`);
}
+ if (mp.weldingDate >= 0 && mp.weldingDate < row.length) {
+ out.weldingDate = clean(row[mp.weldingDate]);
+ console.log(`๐Ÿ“… Welding Date from column ${mp.weldingDate}: "${out.weldingDate}"`);
+ }
+
+ // === STEP 2: Per-format data extraction ===
+
if (format === 'format2') {
- if (mp.identification >= 0) out.identificationNo = clean(row[mp.identification]);
- if (mp.jointNo >= 0) out.jointNo = clean(row[mp.jointNo]);
- if (mp.tagNo >= 0) out.tagNo = clean(row[mp.tagNo]);
+ console.log(`๐Ÿ“„ Processing Format 2 (separate columns)`);
+
+ if (mp.identification >= 0 && mp.identification < row.length) {
+ out.identificationNo = clean(row[mp.identification]);
+ console.log(`๐Ÿ†” Identification from column ${mp.identification}: "${out.identificationNo}"`);
+ }
+
+ if (mp.jointNo >= 0 && mp.jointNo < row.length) {
+ out.jointNo = clean(row[mp.jointNo]);
+ console.log(`๐Ÿ”— Joint No from column ${mp.jointNo}: "${out.jointNo}"`);
+ }
+
+ if (mp.tagNo >= 0 && mp.tagNo < row.length) {
+ out.tagNo = clean(row[mp.tagNo]);
+ console.log(`๐Ÿท๏ธ Tag No from column ${mp.tagNo}: "${out.tagNo}"`);
+ }
} else {
- const combined = mp.identification >= 0 ? row[mp.identification] : '';
- const parsed = parseIdentificationData(combined);
+ console.log(`๐Ÿ“„ Processing Format 1 (combined identification column)`);
+
+ let combinedText = '';
+
+ // Get the text from the mapped identification column
+ if (mp.identification >= 0 && mp.identification < row.length) {
+ combinedText = row[mp.identification];
+ console.log(`๐Ÿ†” Combined text from column ${mp.identification}: "${combinedText}"`);
+ }
+
+ const parsed = parseIdentificationData(combinedText);
out.identificationNo = parsed.identificationNo;
- out.jointNo = parsed.jointNo;
- out.tagNo = parsed.tagNo;
+ out.jointNo = parsed.jointNo;
+ out.tagNo = parsed.tagNo;
+
+ console.log(`๐Ÿ“Š Parsed from identification column:`, parsed);
}
+ // === STEP 3: Fill in missing fields with aggressive pattern matching ===
+ console.log(`๐Ÿ” Aggressive pattern matching for missing fields...`);
+
+ const allText = row.join(' ');
+ console.log(`๐Ÿ“ Full row text: "${allText}"`);
+
+ // If the NO field is empty, look for a number pattern in the first column
+ if (!out.no && row.length > 0) {
+ const firstCol = clean(row[0]);
+ const noPatterns = [
+ /\b(\d{3})\b/g, // 3์ž๋ฆฌ ์ˆซ์ž
+ /\b(\d{2,4})\b/g, // 2-4์ž๋ฆฌ ์ˆซ์ž
+ /^(\d+)/ // ๋งจ ์•ž ์ˆซ์ž
+ ];
+
+ for (const pattern of noPatterns) {
+ const matches = firstCol.match(pattern);
+ if (matches && matches.length > 0) {
+ out.no = matches[0].replace(/\D/g, ''); // ์ˆซ์ž๋งŒ ์ถ”์ถœ
+ console.log(`๐Ÿ“ NO found via pattern in first column: "${out.no}"`);
+ break;
+ }
+ }
+ }
+
+ // Find the Identification No pattern (a long string containing hyphens)
+ if (!out.identificationNo) {
+ const idPatterns = [
+ /[A-Za-z0-9]+-[A-Za-z0-9]+-[A-Za-z0-9\-]+/g,
+ /-\d+[A-Za-z0-9]+-[A-Za-z0-9]+-[A-Za-z0-9]+/g,
+ /\b[A-Z]\d+[A-Z]-\d+-\d+-[A-Z]+-\d+-[A-Z0-9]+-[A-Z]-[A-Z0-9]+\b/g
+ ];
+
+ for (const pattern of idPatterns) {
+ const matches = allText.match(pattern);
+ if (matches && matches.length > 0) {
+ out.identificationNo = matches[0];
+ console.log(`๐Ÿ†” Identification found via pattern: "${out.identificationNo}"`);
+ break;
+ }
+ }
+ }
+
+ // Find the Tag No pattern (a 7-8 digit number)
+ if (!out.tagNo) {
+ const tagMatches = allText.match(/\b\d{7,8}\b/g);
+ if (tagMatches && tagMatches.length > 0) {
+ out.tagNo = tagMatches[0];
+ console.log(`๐Ÿท๏ธ Tag found via pattern: "${out.tagNo}"`);
+ }
+ }
+
+ // Find the Joint No pattern (a short alphanumeric combination)
+ if (!out.jointNo) {
+ const jointPatterns = [
+ /\b[A-Z]{2,4}\d*\b/g, // ๋Œ€๋ฌธ์ž+์ˆซ์ž ์กฐํ•ฉ
+ /\b[A-Za-z0-9]{2,6}\b/g // ์ผ๋ฐ˜์ ์ธ ์งง์€ ์กฐํ•ฉ
+ ];
+
+ for (const pattern of jointPatterns) {
+ const matches = allText.match(pattern);
+ if (matches) {
+ const candidates = matches.filter(m =>
+ m !== out.no &&
+ m !== out.tagNo &&
+ m !== out.identificationNo &&
+ m.length >= 2 && m.length <= 6 &&
+ !/^(no|tag|joint|type|date|welding|project|samsung|class)$/i.test(m)
+ );
+
+ if (candidates.length > 0) {
+ out.jointNo = candidates[0];
+ console.log(`๐Ÿ”— Joint found via pattern: "${out.jointNo}"`);
+ break;
+ }
+ }
+ }
+ }
+
+ // Find the Welding Date pattern
+ if (!out.weldingDate) {
+ const datePatterns = [
+ /\d{4}[.\-/]\d{1,2}[.\-/]\d{1,2}/g,
+ /\d{4}\.\d{2}\.\d{2}/g
+ ];
+
+ for (const pattern of datePatterns) {
+ const matches = allText.match(pattern);
+ if (matches && matches.length > 0) {
+ out.weldingDate = matches[0];
+ console.log(`๐Ÿ“… Date found via pattern: "${out.weldingDate}"`);
+ break;
+ }
+ }
+ }
+
+ // === STEP 4: Quality validation and post-processing ===
+
+ // Clean up the extracted values
+ Object.keys(out).forEach(key => {
+ const value = (out as any)[key];
+ if (typeof value === 'string' && value) {
+ (out as any)[key] = value.replace(/^[^\w]+|[^\w]+$/g, '').trim();
+ }
+ });
+
out.confidence = scoreRow(out);
+
+ console.log(`๐Ÿ“Š Final extracted row:`, out);
+ console.log(`๐ŸŽฏ Row confidence: ${out.confidence}`);
+
+ // Verify that at least minimal data is present
+ const hasAnyData = !!(out.no || out.identificationNo || out.tagNo || out.jointNo);
+
+ if (!hasAnyData) {
+ console.log(`โš ๏ธ No meaningful data extracted from row`);
+ return null;
+ }
+
return out;
}
-
/* -------------------------------------------------------------------------- */
/* Format-1 cell parsing */
/* -------------------------------------------------------------------------- */
function parseIdentificationData (txt: string): { identificationNo: string; jointNo: string; tagNo: string } {
+ console.log(`๐Ÿ” Parsing identification data from: "${txt}"`);
+
const cleaned = clean(txt);
- if (!cleaned) return { identificationNo: '', jointNo: '', tagNo: '' };
+ if (!cleaned) {
+ console.log(`โŒ Empty input text`);
+ return { identificationNo: '', jointNo: '', tagNo: '' };
+ }
+ console.log(`๐Ÿงน Cleaned text: "${cleaned}"`);
+
+ const result = { identificationNo: '', jointNo: '', tagNo: '' };
+
+ // 1. Extract Identification No (a pattern containing two or more hyphens)
+ const idPatterns = [
+ /[A-Za-z0-9]+-[A-Za-z0-9]+-[A-Za-z0-9\-]+/g, // ๊ธฐ๋ณธ ํŒจํ„ด
+ /-\d+[A-Za-z0-9]+-[A-Za-z0-9]+-[A-Za-z0-9]+/g, // ์•ž์— ํ•˜์ดํ”ˆ์ด ์žˆ๋Š” ๊ฒฝ์šฐ
+ /\b[A-Za-z0-9]{2,}-[A-Za-z0-9]{2,}-[A-Za-z0-9]{2,}\b/g // ๋” ์—„๊ฒฉํ•œ ํŒจํ„ด
+ ];
+
+ for (const pattern of idPatterns) {
+ const matches = cleaned.match(pattern);
+ if (matches && matches.length > 0) {
+ // Select the longest match
+ result.identificationNo = matches.reduce((a, b) => a.length >= b.length ? a : b);
+ console.log(`๐Ÿ†” Found identification: "${result.identificationNo}"`);
+ break;
+ }
+ }
+
+ // 2. Extract Tag No (a 7-8 digit number)
+ const tagPatterns = [
+ /\btag[:\s]*(\d{7,8})\b/i, // "tag: 1234567" ํ˜•ํƒœ
+ /\b(\d{7,8})\b/g // ๋‹จ์ˆœ 7-8์ž๋ฆฌ ์ˆซ์ž
+ ];
+
+ for (const pattern of tagPatterns) {
+ const matches = cleaned.match(pattern);
+ if (matches) {
+ if (pattern.source.includes('tag')) {
+ result.tagNo = matches[1] || matches[0];
+ } else {
+ // Find all 7-8 digit numbers and pick the most suitable one
+ const candidates = matches.filter(m => m && m.length >= 7 && m.length <= 8);
+ if (candidates.length > 0) {
+ result.tagNo = candidates[0];
+ }
+ }
+ if (result.tagNo) {
+ console.log(`๐Ÿท๏ธ Found tag: "${result.tagNo}"`);
+ break;
+ }
+ }
+ }
+
+ // 3. Extract Joint No (from the remaining tokens)
const tokens = cleaned.split(/\s+/).map(clean).filter(Boolean);
-
- // Identification candidate: the longest of the tokens that contain two or more hyphens
- const idCand = tokens.filter(t => t.split('-').length >= 3).sort((a, b) => b.length - a.length);
- const identificationNo = idCand[0] || '';
-
- const residual = tokens.filter(t => t !== identificationNo);
- if (!residual.length) return { identificationNo, jointNo: '', tagNo: '' };
-
- residual.sort((a, b) => a.length - b.length);
- const jointNo = residual[0] || '';
- const tagNo = residual[residual.length - 1] || '';
-
- return { identificationNo, jointNo, tagNo };
+ console.log(`๐Ÿ“ All tokens: [${tokens.join(', ')}]`);
+
+ // Exclude tokens that have already been used
+ const usedTokens = new Set([result.identificationNo, result.tagNo]);
+ const remainingTokens = tokens.filter(token =>
+ !usedTokens.has(token) &&
+ !result.identificationNo.includes(token) &&
+ !result.tagNo.includes(token) &&
+ token.length > 1 &&
+ !/^(tag|joint|no|identification|์‹๋ณ„|๋ฒˆํ˜ธ)$/i.test(token)
+ );
+
+ console.log(`๐Ÿ”„ Remaining tokens for joint: [${remainingTokens.join(', ')}]`);
+
+ if (remainingTokens.length > 0) {
+ // Pick the shortest token that is a letter+digit combination as Joint No
+ const jointCandidates = remainingTokens
+ .filter(token => /^[A-Za-z0-9]+$/.test(token) && token.length >= 2 && token.length <= 8)
+ .sort((a, b) => a.length - b.length);
+
+ if (jointCandidates.length > 0) {
+ result.jointNo = jointCandidates[0];
+ console.log(`๐Ÿ”— Found joint: "${result.jointNo}"`);
+ } else if (remainingTokens.length > 0) {
+ // If there are no candidates, use the shortest token
+ result.jointNo = remainingTokens.reduce((a, b) => a.length <= b.length ? a : b);
+ console.log(`๐Ÿ”— Found joint (fallback): "${result.jointNo}"`);
+ }
+ }
+
+ // 4. Validate and clean up the result
+ Object.keys(result).forEach(key => {
+ const value = (result as any)[key];
+ if (value && typeof value === 'string') {
+ (result as any)[key] = value.replace(/^[^\w]+|[^\w]+$/g, ''); // ์•ž๋’ค ํŠน์ˆ˜๋ฌธ์ž ์ œ๊ฑฐ
+ }
+ });
+
+ console.log(`๐Ÿ“Š Final parsed result:`, result);
+ return result;
}
-
/* -------------------------------------------------------------------------- */
/* Helpers */
/* -------------------------------------------------------------------------- */
const clean = (s: string = '') => s.replace(/[\r\n\t]+/g, ' ').replace(/\s+/g, ' ').trim();
const isBlankRow = (row: string[]) => row.every(c => !clean(c));
-const isValidRow = (r: ExtractedRow) => !!(r.no || r.identificationNo);
+function isValidRow (r: ExtractedRow): boolean {
+ console.log(`โœ… Validating row: no="${r.no}", id="${r.identificationNo}", tag="${r.tagNo}", joint="${r.jointNo}"`);
+
+ // Level 1: existing strict condition
+ if (r.no && r.no.trim() || r.identificationNo && r.identificationNo.trim()) {
+ console.log(`โœ… Level 1 validation passed (has no or identification)`);
+ return true;
+ }
+
+ // Level 2: relaxed condition - at least two of the main fields
+ const mainFields = [
+ r.no?.trim(),
+ r.identificationNo?.trim(),
+ r.tagNo?.trim(),
+ r.jointNo?.trim()
+ ].filter(Boolean);
+
+ if (mainFields.length >= 2) {
+ console.log(`โœ… Level 2 validation passed (${mainFields.length} main fields present)`);
+ return true;
+ }
+
+ // Level 3: more lenient condition - any field at all holding a meaningful value
+ const allFields = [
+ r.no?.trim(),
+ r.identificationNo?.trim(),
+ r.tagNo?.trim(),
+ r.jointNo?.trim(),
+ r.jointType?.trim(),
+ r.weldingDate?.trim()
+ ].filter(field => field && field.length > 1); // 1๊ธ€์ž ์ด์ƒ
+
+ if (allFields.length >= 1) {
+ console.log(`โœ… Level 3 validation passed (${allFields.length} fields with meaningful content)`);
+ return true;
+ }
+
+ console.log(`โŒ Validation failed - no meaningful content found`);
+ return false;
+}
function scoreRow (r: ExtractedRow): number {
const w: Record<keyof ExtractedRow, number> = {