summary refs log tree commit diff
path: root/app/api
diff options
context:
space:
mode:
Diffstat (limited to 'app/api')
-rw-r--r-- app/api/ocr/enhanced/route.ts 41
-rw-r--r-- app/api/ocr/utils/tableExtraction.ts 651
2 files changed, 93 insertions, 599 deletions
diff --git a/app/api/ocr/enhanced/route.ts b/app/api/ocr/enhanced/route.ts
index f0a15707..d0e5aabd 100644
--- a/app/api/ocr/enhanced/route.ts
+++ b/app/api/ocr/enhanced/route.ts
@@ -320,13 +320,15 @@ async function processImageOptimized(
// 결과 처리
const reportNo = extractReportNo(ocrResult);
+ const inspectionDate = extractinspectionDate(ocrResult);
const analysis = analyzeOCRQuality(ocrResult);
const rawTables = await extractTablesFromOCR(ocrResult) as BaseExtractedRow[][];
const extractedTables: ExtractedRow[][] = rawTables.map(table =>
table.map(row => ({
...row,
- reportNo
+ reportNo,
+ inspectionDate
}))
);
@@ -497,12 +499,14 @@ async function processAsWholePDF(
// 결과 처리
const reportNo = extractReportNo(ocrResult);
+ const inspectionDate = extractinspectionDate(ocrResult);
const rawTables = await extractTablesFromOCR(ocrResult) as BaseExtractedRow[][];
const extractedTables: ExtractedRow[][] = rawTables.map(table =>
table.map(row => ({
...row,
- reportNo
+ reportNo,
+ inspectionDate
}))
);
@@ -552,12 +556,14 @@ async function processPageByPageOptimized(
// 결과 처리
const reportNo = extractReportNo(ocrResult);
+ const inspectionDate = extractinspectionDate(ocrResult);
const rawTables = await extractTablesFromOCR(ocrResult) as BaseExtractedRow[][];
const pageTables: ExtractedRow[][] = rawTables.map(table =>
table.map(row => ({
...row,
- reportNo
+ reportNo,
+ inspectionDate
}))
);
@@ -738,11 +744,13 @@ async function saveToDatabase({
if (table.length > 0) {
const rowsData: NewOcrRow[] = table.map((row, rowIndex) => ({
tableId: savedTable.id,
+ fileName:file.name,
sessionId,
rowIndex,
reportNo: row.reportNo ?? null,
no: row.no ?? null,
identificationNo: row.identificationNo ?? null,
+ inspectionDate: row.inspectionDate ?? null,
tagNo: row.tagNo ?? null,
jointNo: row.jointNo ?? null,
jointType: row.jointType ?? null,
@@ -855,6 +863,33 @@ function extractReportNo(ocrResult: any): string {
}
}
+/**
+ * Reads the inspection date from the first table of the first OCR image.
+ *
+ * The value is taken from the cell whose rowIndex is 4 and columnIndex is 3;
+ * all whitespace is stripped from the recognized text before it is returned.
+ *
+ * @param ocrResult - Raw OCR response. Assumed shape:
+ *   images[].tables[].cells[] with cellTextLines[].cellWords[].inferText
+ *   (TODO confirm against the OCR provider's response schema).
+ * @returns The whitespace-free cell text, or 'UNKNOWN' when the table, the
+ *   target cell, or its text is missing, or when reading the result throws.
+ */
+function extractinspectionDate(ocrResult: any): string {
+  // Joins the trimmed inferText of every word in every text line of a cell.
+  // Defined before use instead of relying on hoisting past the return paths.
+  const cellText = (cell: any): string =>
+    (cell.cellTextLines ?? [])
+      .flatMap((l: any) =>
+        (l.cellWords ?? []).map((w: any) => (w.inferText ?? '').trim())
+      )
+      .join(' ');
+
+  try {
+    // `ocrResult?.` keeps a missing OCR result on the normal 'UNKNOWN' path
+    // rather than raising a TypeError that is then logged as an error.
+    const table = ocrResult?.images?.[0]?.tables?.[0];
+    if (!table?.cells?.length) return 'UNKNOWN';
+
+    const target = table.cells.find(
+      (c: any) => c.rowIndex === 4 && c.columnIndex === 3
+    );
+    if (!target) return 'UNKNOWN';
+
+    const inspectionDate = cellText(target).replace(/\s+/g, '');
+    return inspectionDate || 'UNKNOWN';
+  } catch (e) {
+    console.error('extractinspectionDate 오류:', e);
+    return 'UNKNOWN';
+  }
+}
+
// OCR API 호출 (기존과 동일)
async function callOCRAPI(base64: string, format: string, filename: string, rotation?: number): Promise<any> {
console.log('🌐 === OCR API CALL DEBUG ===');
diff --git a/app/api/ocr/utils/tableExtraction.ts b/app/api/ocr/utils/tableExtraction.ts
index 0a727f84..3c44e7fb 100644
--- a/app/api/ocr/utils/tableExtraction.ts
+++ b/app/api/ocr/utils/tableExtraction.ts
@@ -69,107 +69,40 @@ export async function extractTablesFromOCR (ocrResult: any): Promise<ExtractedRo
function isRelevantTable (table: OCRTable): boolean {
const headers = table.cells.filter(c => c.rowIndex < 3).map(getCellText).join(' ').toLowerCase();
- console.log(`🔍 Checking table relevance. Headers: "${headers}"`);
-
- // 기존 조건
- const hasNoColumn = /\bno\b|번호/.test(headers);
- const hasIdentification = /identification|식별|ident|id/.test(headers);
-
- console.log(`📝 Has NO column: ${hasNoColumn}`);
- console.log(`📝 Has Identification: ${hasIdentification}`);
-
- // 기본 조건
- if (hasNoColumn && hasIdentification) {
- console.log(`✅ Table passes strict criteria`);
- return true;
- }
-
- // 완화된 조건들
- const relaxedConditions = [
- // 조건 1: 테이블에 여러 열이 있고 숫자나 식별자 패턴이 보이는 경우
- table.cells.length > 10 && /\d+/.test(headers),
-
- // 조건 2: joint, tag, weld 등 관련 키워드가 있는 경우
- /joint|tag|weld|type|date/.test(headers),
-
- // 조건 3: 식별번호 패턴이 보이는 경우 (하이픈이 포함된 문자열)
- headers.includes('-') && headers.length > 20,
-
- // 조건 4: 한국어 관련 키워드
- /용접|조인트|태그/.test(headers)
- ];
-
- const passedConditions = relaxedConditions.filter(Boolean).length;
- console.log(`📊 Relaxed conditions passed: ${passedConditions}/${relaxedConditions.length}`);
-
- if (passedConditions >= 1) {
- console.log(`✅ Table passes relaxed criteria`);
- return true;
- }
-
- console.log(`❌ Table does not meet any criteria`);
- return false;
+ return /\bno\b|번호/.test(headers) && /identification|식별|ident|id/.test(headers);
}
+
/* -------------------------------------------------------------------------- */
/* 표 해석 */
/* -------------------------------------------------------------------------- */
function extractTableData (table: OCRTable, imgIdx: number, tblIdx: number): ExtractedRow[] {
- console.log(`🔧 Starting extractTableData for table ${imgIdx}-${tblIdx}`);
-
const grid = buildGrid(table);
- console.log(`📊 Grid size: ${grid.length} rows x ${grid[0]?.length || 0} columns`);
-
const headerRowIdx = findHeaderRow(grid);
- console.log(`📍 Header row index: ${headerRowIdx}`);
- if (headerRowIdx === -1) {
- console.log(`❌ No header row found`);
- return [];
- }
+ console.log(headerRowIdx,"headerRowIdx")
+
+ if (headerRowIdx === -1) return [];
- const format = detectFormat(grid[headerRowIdx]);
- const mapping = mapColumns(grid[headerRowIdx]);
-
- console.log(`📋 Detected format: ${format}`);
- console.log(`🗂️ Column mapping:`, mapping);
+ const format = detectFormat(grid[headerRowIdx]);
+ const mapping = mapColumns(grid[headerRowIdx]);
const seen = new Set<string>();
const data: ExtractedRow[] = [];
for (let r = headerRowIdx + 1; r < grid.length; r++) {
const row = grid[r];
-
- if (isBlankRow(row)) {
- console.log(`⏭️ Row ${r}: blank, skipping`);
- continue;
- }
-
- console.log(`🔍 Processing row ${r}: [${row.join(' | ')}]`);
+ if (isBlankRow(row)) continue;
const parsed = buildRow(row, format, mapping, tblIdx, r);
- if (!parsed) {
- console.log(`❌ Row ${r}: failed to parse`);
- continue;
- }
-
- if (!isValidRow(parsed)) {
- console.log(`❌ Row ${r}: invalid (no: "${parsed.no}", id: "${parsed.identificationNo}")`);
- continue;
- }
+ if (!parsed || !isValidRow(parsed)) continue;
const key = `${parsed.no}-${parsed.identificationNo}`;
- if (seen.has(key)) {
- console.log(`⚠️ Row ${r}: duplicate key "${key}", skipping`);
- continue;
- }
-
+ if (seen.has(key)) continue;
seen.add(key);
+
data.push(parsed);
- console.log(`✅ Row ${r}: added (${JSON.stringify(parsed)})`);
}
-
- console.log(`🎯 Table ${imgIdx}-${tblIdx}: extracted ${data.length} valid rows`);
return data;
}
@@ -178,39 +111,18 @@ function extractTableData (table: OCRTable, imgIdx: number, tblIdx: number): Ext
/* -------------------------------------------------------------------------- */
function buildGrid (table: OCRTable): string[][] {
- console.log(`🔧 Building grid from ${table.cells.length} cells`);
-
const maxR = Math.max(...table.cells.map(c => c.rowIndex + c.rowSpan - 1));
const maxC = Math.max(...table.cells.map(c => c.columnIndex + c.columnSpan - 1));
-
- console.log(`📊 Grid dimensions: ${maxR + 1} rows x ${maxC + 1} columns`);
-
const grid = Array.from({ length: maxR + 1 }, () => Array(maxC + 1).fill(''));
- // 셀별 상세 정보 출력
- table.cells.forEach((cell, idx) => {
+ table.cells.forEach(cell => {
const txt = getCellText(cell);
- console.log(`📱 Cell ${idx}: (${cell.rowIndex},${cell.columnIndex}) span(${cell.rowSpan},${cell.columnSpan}) = "${txt}"`);
-
for (let r = cell.rowIndex; r < cell.rowIndex + cell.rowSpan; r++) {
for (let c = cell.columnIndex; c < cell.columnIndex + cell.columnSpan; c++) {
- const oldValue = grid[r][c];
- const newValue = oldValue ? `${oldValue} ${txt}` : txt;
- grid[r][c] = newValue;
-
- if (oldValue) {
- console.log(`🔄 Grid[${r}][${c}]: "${oldValue}" → "${newValue}"`);
- }
+ grid[r][c] = grid[r][c] ? `${grid[r][c]} ${txt}` : txt;
}
}
});
-
- // 최종 그리드 출력
- console.log(`📋 Final grid:`);
- grid.forEach((row, r) => {
- console.log(` Row ${r}: [${row.map(cell => `"${cell}"`).join(', ')}]`);
- });
-
return grid;
}
@@ -219,52 +131,13 @@ function getCellText (cell: TableCell): string {
}
function findHeaderRow (grid: string[][]): number {
- console.log(`🔍 Finding header row in grid with ${grid.length} rows`);
-
- for (let i = 0; i < Math.min(5, grid.length); i++) {
- const rowText = grid[i].join(' ').toLowerCase();
- console.log(`📝 Row ${i}: "${rowText}"`);
-
- // 기존 엄격한 조건
- if (/\bno\b|번호/.test(rowText) && /identification|식별|ident/.test(rowText)) {
- console.log(`✅ Row ${i}: Strict match`);
- return i;
- }
-
- // 완화된 조건들
- const relaxedMatches = [
- // 1. NO 컬럼 + 다른 관련 키워드
- (/\bno\b|번호/.test(rowText) && /joint|tag|type|weld|date/.test(rowText)),
-
- // 2. ID/식별 + 다른 관련 키워드
- (/identification|식별|ident|id/.test(rowText) && /joint|tag|no|type/.test(rowText)),
-
- // 3. 용접 관련 키워드가 여러 개
- (rowText.match(/joint|tag|type|weld|date|no|id|식별|번호|용접/g)?.length >= 3),
-
- // 4. 첫 번째 행이고 여러 단어가 있는 경우
- (i === 0 && rowText.split(/\s+/).filter(w => w.length > 1).length >= 3)
- ];
-
- if (relaxedMatches.some(Boolean)) {
- console.log(`✅ Row ${i}: Relaxed match`);
- return i;
- }
-
- console.log(`❌ Row ${i}: No match`);
- }
-
- // 최후의 수단: 첫 번째 비어있지 않은 행
for (let i = 0; i < Math.min(3, grid.length); i++) {
- if (grid[i].some(cell => cell.trim().length > 0)) {
- console.log(`⚠️ Using row ${i} as fallback header`);
- return i;
- }
+ const t = grid[i].join(' ').toLowerCase();
+ if (/\bno\b|번호/.test(t) && /identification|식별|ident/.test(t)) return i;
}
-
- console.log(`❌ No header row found`);
return -1;
}
+
/* -------------------------------------------------------------------------- */
/* Column Mapping */
/* -------------------------------------------------------------------------- */
@@ -276,153 +149,19 @@ function detectFormat (header: string[]): 'format1' | 'format2' {
function mapColumns (header: string[]): ColumnMapping {
const mp: ColumnMapping = { no: -1, identification: -1, tagNo: -1, jointNo: -1, jointType: -1, weldingDate: -1 };
-
- console.log(`🗂️ Smart mapping columns from header: [${header.map(h => `"${h}"`).join(', ')}]`);
- // === STEP 1: 기존 개별 컬럼 매핑 ===
header.forEach((h, i) => {
- const t = h.toLowerCase().trim();
- console.log(`📋 Column ${i}: "${h}" → "${t}"`);
-
- if (mp.no === -1 && (/^no\.?$/i.test(t) || /^번호$/i.test(t) || /^순번$/i.test(t))) {
- mp.no = i;
- console.log(`✅ NO column (individual) mapped to index ${i}`);
- }
-
- if (mp.identification === -1 && (/identification.*no/i.test(t) || /식별.*번호/i.test(t))) {
- mp.identification = i;
- console.log(`✅ Identification column (individual) mapped to index ${i}`);
- }
-
- if (mp.tagNo === -1 && (/tag.*no/i.test(t) || /태그.*번호/i.test(t))) {
- mp.tagNo = i;
- console.log(`✅ Tag No column (individual) mapped to index ${i}`);
- }
-
- if (mp.jointNo === -1 && (/joint.*no/i.test(t) || /조인트.*번호/i.test(t) || /oint.*no/i.test(t))) {
- mp.jointNo = i;
- console.log(`✅ Joint No column (individual) mapped to index ${i}`);
- }
-
- if (mp.jointType === -1 && (/joint.*type/i.test(t) || /^type$/i.test(t) || /형태/i.test(t))) {
- mp.jointType = i;
- console.log(`✅ Joint Type column (individual) mapped to index ${i}`);
- }
-
- if (mp.weldingDate === -1 && (/welding.*date/i.test(t) || /weld.*date/i.test(t) || /^date$/i.test(t) || /날짜/i.test(t))) {
- mp.weldingDate = i;
- console.log(`✅ Welding Date column (individual) mapped to index ${i}`);
- }
- });
-
- // === STEP 2: 실용적 추론 ===
- console.log(`🤖 Starting practical column inference...`);
-
- // NO 컬럼이 매핑되지 않았다면, 첫 번째 컬럼을 NO로 추정
- if (mp.no === -1) {
- mp.no = 0;
- console.log(`🔮 NO column inferred as index 0 (first column)`);
- }
-
- // Identification 컬럼 찾기 - "identification" 키워드가 포함된 컬럼 중에서
- if (mp.identification === -1) {
- for (let i = 0; i < header.length; i++) {
- const text = header[i].toLowerCase();
- if (text.includes('identification') || text.includes('식별')) {
- mp.identification = i;
- console.log(`🆔 Identification column found at index ${i}`);
- break;
- }
- }
- }
-
- // Tag No 컬럼 찾기 - "tag" 키워드가 포함된 컬럼 중에서
- if (mp.tagNo === -1) {
- for (let i = 0; i < header.length; i++) {
- const text = header[i].toLowerCase();
- if (text.includes('tag') && !text.includes('no')) {
- mp.tagNo = i;
- console.log(`🏷️ Tag column found at index ${i}`);
- break;
- }
- }
- }
-
- // Joint No 컬럼 찾기
- if (mp.jointNo === -1) {
- for (let i = 0; i < header.length; i++) {
- const text = header[i].toLowerCase();
- if (text.includes('joint') || text.includes('oint')) {
- mp.jointNo = i;
- console.log(`🔗 Joint column found at index ${i}`);
- break;
- }
- }
- }
-
- // === STEP 3: 패턴 기반 추론 (마지막 수단) ===
- console.log(`🎯 Pattern-based fallback mapping...`);
-
- // 전체 헤더에서 실제 식별번호 패턴이 있는 컬럼 찾기
- if (mp.identification === -1) {
- for (let i = 0; i < header.length; i++) {
- const text = header[i];
- // 하이픈이 포함된 긴 문자열이 있는 컬럼
- if (text.includes('-') && text.length > 15) {
- mp.identification = i;
- console.log(`🆔 Identification inferred at index ${i} (contains ID pattern)`);
- break;
- }
- }
- }
-
- // 숫자 패턴이 있는 컬럼을 Tag No로 추정
- if (mp.tagNo === -1) {
- for (let i = 1; i < header.length; i++) { // 첫 번째 컬럼 제외
- const text = header[i];
- // 7-8자리 숫자가 있는 컬럼
- if (/\d{7,8}/.test(text)) {
- mp.tagNo = i;
- console.log(`🏷️ Tag No inferred at index ${i} (contains number pattern)`);
- break;
- }
- }
- }
-
- // === STEP 4: 기본값 설정 ===
- console.log(`🔧 Setting default values for unmapped columns...`);
-
- // 여전히 매핑되지 않은 중요한 컬럼들에 대해 순서 기반 추정
- const essentialColumns = [
- { key: 'identification', currentValue: mp.identification, defaultIndex: 1 },
- { key: 'tagNo', currentValue: mp.tagNo, defaultIndex: 2 },
- { key: 'jointNo', currentValue: mp.jointNo, defaultIndex: 3 },
- { key: 'jointType', currentValue: mp.jointType, defaultIndex: 4 },
- { key: 'weldingDate', currentValue: mp.weldingDate, defaultIndex: Math.min(5, header.length - 1) }
- ];
-
- essentialColumns.forEach(col => {
- if ((col.currentValue as number) === -1 && col.defaultIndex < header.length) {
- (mp as any)[col.key] = col.defaultIndex;
- console.log(`🔧 ${col.key} set to default index ${col.defaultIndex}`);
- }
+ const t = h.toLowerCase();
+ if (/^no\.?$/.test(t) && !/ident|tag|joint/.test(t)) mp.no = i;
+ else if (/identification|ident/.test(t)) mp.identification = i;
+ else if (/tag.*no/.test(t)) mp.tagNo = i;
+ else if (/joint.*no/.test(t)) mp.jointNo = i;
+ else if (/joint.*type/.test(t) || (/^type$/.test(t) && mp.jointType === -1)) mp.jointType = i;
+ else if (/welding|date/.test(t)) mp.weldingDate = i;
});
-
- console.log(`🎯 Final optimized column mapping:`, mp);
-
- // === STEP 5: 매핑 품질 검증 ===
- const mappedCount = Object.values(mp).filter(v => v !== -1).length;
- const totalColumns = Object.keys(mp).length;
- const mappingQuality = mappedCount / totalColumns;
-
- console.log(`📊 Mapping quality: ${mappedCount}/${totalColumns} (${(mappingQuality * 100).toFixed(1)}%)`);
-
- if (mappingQuality < 0.5) {
- console.warn(`⚠️ Low mapping quality detected. Consider manual adjustment.`);
- }
-
return mp;
}
+
/* -------------------------------------------------------------------------- */
/* Row Extraction */
/* -------------------------------------------------------------------------- */
@@ -434,351 +173,71 @@ function buildRow (
tblIdx: number,
rowIdx: number
): ExtractedRow | null {
- console.log(`🔨 Building row from: [${row.map(r => `"${r}"`).join(', ')}]`);
- console.log(`📋 Using mapping:`, mp);
- console.log(`📄 Format: ${format}`);
-
const out: ExtractedRow = {
- no: '',
+ no: mp.no >= 0 ? clean(row[mp.no]) : '',
identificationNo: '',
tagNo: '',
jointNo: '',
- jointType: '',
+ jointType: mp.jointType >= 0 ? clean(row[mp.jointType]) : '',
weldingDate: '',
confidence: 0,
sourceTable: tblIdx,
sourceRow: rowIdx,
};
- // === STEP 1: 매핑된 컬럼에서 기본 추출 ===
-
- // NO 컬럼 추출
- if (mp.no >= 0 && mp.no < row.length) {
- const rawNo = clean(row[mp.no]);
- // NO 필드에서 첫 번째 숫자 패턴 추출
- const noMatch = rawNo.match(/\b(\d{2,4})\b/);
- out.no = noMatch ? noMatch[1] : rawNo;
- console.log(`📍 NO from column ${mp.no}: "${out.no}" (raw: "${rawNo}")`);
- }
-
- // Joint Type, Welding Date는 기존대로
- if (mp.jointType >= 0 && mp.jointType < row.length) {
- out.jointType = clean(row[mp.jointType]);
- console.log(`🔗 Joint Type from column ${mp.jointType}: "${out.jointType}"`);
+ if (mp.weldingDate >= 0) out.weldingDate = clean(row[mp.weldingDate]);
+ else {
+ const idx = row.findIndex(col => /\d{4}[.\-/]\d{1,2}[.\-/]\d{1,2}/.test(col));
+ if (idx >= 0) out.weldingDate = clean(row[idx]);
}
- if (mp.weldingDate >= 0 && mp.weldingDate < row.length) {
- out.weldingDate = clean(row[mp.weldingDate]);
- console.log(`📅 Welding Date from column ${mp.weldingDate}: "${out.weldingDate}"`);
- }
-
- // === STEP 2: Format별 데이터 추출 ===
-
if (format === 'format2') {
- console.log(`📄 Processing Format 2 (separate columns)`);
-
- if (mp.identification >= 0 && mp.identification < row.length) {
- out.identificationNo = clean(row[mp.identification]);
- console.log(`🆔 Identification from column ${mp.identification}: "${out.identificationNo}"`);
- }
-
- if (mp.jointNo >= 0 && mp.jointNo < row.length) {
- out.jointNo = clean(row[mp.jointNo]);
- console.log(`🔗 Joint No from column ${mp.jointNo}: "${out.jointNo}"`);
- }
-
- if (mp.tagNo >= 0 && mp.tagNo < row.length) {
- out.tagNo = clean(row[mp.tagNo]);
- console.log(`🏷️ Tag No from column ${mp.tagNo}: "${out.tagNo}"`);
- }
+ if (mp.identification >= 0) out.identificationNo = clean(row[mp.identification]);
+ if (mp.jointNo >= 0) out.jointNo = clean(row[mp.jointNo]);
+ if (mp.tagNo >= 0) out.tagNo = clean(row[mp.tagNo]);
} else {
- console.log(`📄 Processing Format 1 (combined identification column)`);
-
- let combinedText = '';
-
- // 매핑된 identification 컬럼에서 텍스트 가져오기
- if (mp.identification >= 0 && mp.identification < row.length) {
- combinedText = row[mp.identification];
- console.log(`🆔 Combined text from column ${mp.identification}: "${combinedText}"`);
- }
-
- const parsed = parseIdentificationData(combinedText);
+ const combined = mp.identification >= 0 ? row[mp.identification] : '';
+ const parsed = parseIdentificationData(combined);
out.identificationNo = parsed.identificationNo;
- out.jointNo = parsed.jointNo;
- out.tagNo = parsed.tagNo;
-
- console.log(`📊 Parsed from identification column:`, parsed);
+ out.jointNo = parsed.jointNo;
+ out.tagNo = parsed.tagNo;
}
- // === STEP 3: 적극적 패턴 매칭으로 누락된 필드 채우기 ===
- console.log(`🔍 Aggressive pattern matching for missing fields...`);
-
- const allText = row.join(' ');
- console.log(`📝 Full row text: "${allText}"`);
-
- // NO 필드가 비어있다면 첫 번째 컬럼에서 숫자 패턴 찾기
- if (!out.no && row.length > 0) {
- const firstCol = clean(row[0]);
- const noPatterns = [
- /\b(\d{3})\b/g, // 3자리 숫자
- /\b(\d{2,4})\b/g, // 2-4자리 숫자
- /^(\d+)/ // 맨 앞 숫자
- ];
-
- for (const pattern of noPatterns) {
- const matches = firstCol.match(pattern);
- if (matches && matches.length > 0) {
- out.no = matches[0].replace(/\D/g, ''); // 숫자만 추출
- console.log(`📍 NO found via pattern in first column: "${out.no}"`);
- break;
- }
- }
- }
-
- // Identification No 패턴 찾기 (하이픈이 포함된 긴 문자열)
- if (!out.identificationNo) {
- const idPatterns = [
- /[A-Za-z0-9]+-[A-Za-z0-9]+-[A-Za-z0-9\-]+/g,
- /-\d+[A-Za-z0-9]+-[A-Za-z0-9]+-[A-Za-z0-9]+/g,
- /\b[A-Z]\d+[A-Z]-\d+-\d+-[A-Z]+-\d+-[A-Z0-9]+-[A-Z]-[A-Z0-9]+\b/g
- ];
-
- for (const pattern of idPatterns) {
- const matches = allText.match(pattern);
- if (matches && matches.length > 0) {
- out.identificationNo = matches[0];
- console.log(`🆔 Identification found via pattern: "${out.identificationNo}"`);
- break;
- }
- }
- }
-
- // Tag No 패턴 찾기 (7-8자리 숫자)
- if (!out.tagNo) {
- const tagMatches = allText.match(/\b\d{7,8}\b/g);
- if (tagMatches && tagMatches.length > 0) {
- out.tagNo = tagMatches[0];
- console.log(`🏷️ Tag found via pattern: "${out.tagNo}"`);
- }
- }
-
- // Joint No 패턴 찾기 (짧은 영숫자 조합)
- if (!out.jointNo) {
- const jointPatterns = [
- /\b[A-Z]{2,4}\d*\b/g, // 대문자+숫자 조합
- /\b[A-Za-z0-9]{2,6}\b/g // 일반적인 짧은 조합
- ];
-
- for (const pattern of jointPatterns) {
- const matches = allText.match(pattern);
- if (matches) {
- const candidates = matches.filter(m =>
- m !== out.no &&
- m !== out.tagNo &&
- m !== out.identificationNo &&
- m.length >= 2 && m.length <= 6 &&
- !/^(no|tag|joint|type|date|welding|project|samsung|class)$/i.test(m)
- );
-
- if (candidates.length > 0) {
- out.jointNo = candidates[0];
- console.log(`🔗 Joint found via pattern: "${out.jointNo}"`);
- break;
- }
- }
- }
- }
-
- // Welding Date 패턴 찾기
- if (!out.weldingDate) {
- const datePatterns = [
- /\d{4}[.\-/]\d{1,2}[.\-/]\d{1,2}/g,
- /\d{4}\.\d{2}\.\d{2}/g
- ];
-
- for (const pattern of datePatterns) {
- const matches = allText.match(pattern);
- if (matches && matches.length > 0) {
- out.weldingDate = matches[0];
- console.log(`📅 Date found via pattern: "${out.weldingDate}"`);
- break;
- }
- }
- }
-
- // === STEP 4: 품질 검증 및 후처리 ===
-
- // 추출된 값들 정리
- Object.keys(out).forEach(key => {
- const value = (out as any)[key];
- if (typeof value === 'string' && value) {
- (out as any)[key] = value.replace(/^[^\w]+|[^\w]+$/g, '').trim();
- }
- });
-
out.confidence = scoreRow(out);
-
- console.log(`📊 Final extracted row:`, out);
- console.log(`🎯 Row confidence: ${out.confidence}`);
-
- // 최소한의 데이터가 있는지 검증
- const hasAnyData = !!(out.no || out.identificationNo || out.tagNo || out.jointNo);
-
- if (!hasAnyData) {
- console.log(`⚠️ No meaningful data extracted from row`);
- return null;
- }
-
return out;
}
+
/* -------------------------------------------------------------------------- */
/* Format‑1 셀 파싱 */
/* -------------------------------------------------------------------------- */
function parseIdentificationData (txt: string): { identificationNo: string; jointNo: string; tagNo: string } {
- console.log(`🔍 Parsing identification data from: "${txt}"`);
-
const cleaned = clean(txt);
- if (!cleaned) {
- console.log(`❌ Empty input text`);
- return { identificationNo: '', jointNo: '', tagNo: '' };
- }
+ if (!cleaned) return { identificationNo: '', jointNo: '', tagNo: '' };
- console.log(`🧹 Cleaned text: "${cleaned}"`);
-
- const result = { identificationNo: '', jointNo: '', tagNo: '' };
-
- // 1. Identification No 추출 (하이픈이 2개 이상 포함된 패턴)
- const idPatterns = [
- /[A-Za-z0-9]+-[A-Za-z0-9]+-[A-Za-z0-9\-]+/g, // 기본 패턴
- /-\d+[A-Za-z0-9]+-[A-Za-z0-9]+-[A-Za-z0-9]+/g, // 앞에 하이픈이 있는 경우
- /\b[A-Za-z0-9]{2,}-[A-Za-z0-9]{2,}-[A-Za-z0-9]{2,}\b/g // 더 엄격한 패턴
- ];
-
- for (const pattern of idPatterns) {
- const matches = cleaned.match(pattern);
- if (matches && matches.length > 0) {
- // 가장 긴 매치를 선택
- result.identificationNo = matches.reduce((a, b) => a.length >= b.length ? a : b);
- console.log(`🆔 Found identification: "${result.identificationNo}"`);
- break;
- }
- }
-
- // 2. Tag No 추출 (7-8자리 숫자)
- const tagPatterns = [
- /\btag[:\s]*(\d{7,8})\b/i, // "tag: 1234567" 형태
- /\b(\d{7,8})\b/g // 단순 7-8자리 숫자
- ];
-
- for (const pattern of tagPatterns) {
- const matches = cleaned.match(pattern);
- if (matches) {
- if (pattern.source.includes('tag')) {
- result.tagNo = matches[1] || matches[0];
- } else {
- // 모든 7-8자리 숫자를 찾아서 가장 적절한 것 선택
- const candidates = matches.filter(m => m && m.length >= 7 && m.length <= 8);
- if (candidates.length > 0) {
- result.tagNo = candidates[0];
- }
- }
- if (result.tagNo) {
- console.log(`🏷️ Found tag: "${result.tagNo}"`);
- break;
- }
- }
- }
-
- // 3. Joint No 추출 (나머지 토큰 중에서)
const tokens = cleaned.split(/\s+/).map(clean).filter(Boolean);
- console.log(`📝 All tokens: [${tokens.join(', ')}]`);
-
- // 이미 사용된 토큰들 제외
- const usedTokens = new Set([result.identificationNo, result.tagNo]);
- const remainingTokens = tokens.filter(token =>
- !usedTokens.has(token) &&
- !result.identificationNo.includes(token) &&
- !result.tagNo.includes(token) &&
- token.length > 1 &&
- !/^(tag|joint|no|identification|식별|번호)$/i.test(token)
- );
-
- console.log(`🔄 Remaining tokens for joint: [${remainingTokens.join(', ')}]`);
-
- if (remainingTokens.length > 0) {
- // 가장 짧고 알파벳+숫자 조합인 토큰을 Joint No로 선택
- const jointCandidates = remainingTokens
- .filter(token => /^[A-Za-z0-9]+$/.test(token) && token.length >= 2 && token.length <= 8)
- .sort((a, b) => a.length - b.length);
-
- if (jointCandidates.length > 0) {
- result.jointNo = jointCandidates[0];
- console.log(`🔗 Found joint: "${result.jointNo}"`);
- } else if (remainingTokens.length > 0) {
- // 후보가 없으면 가장 짧은 토큰 사용
- result.jointNo = remainingTokens.reduce((a, b) => a.length <= b.length ? a : b);
- console.log(`🔗 Found joint (fallback): "${result.jointNo}"`);
- }
- }
-
- // 4. 결과 검증 및 정리
- Object.keys(result).forEach(key => {
- const value = (result as any)[key];
- if (value && typeof value === 'string') {
- (result as any)[key] = value.replace(/^[^\w]+|[^\w]+$/g, ''); // 앞뒤 특수문자 제거
- }
- });
-
- console.log(`📊 Final parsed result:`, result);
- return result;
+
+ // Identification 후보: 하이픈이 2개 이상 포함된 토큰 가운데 가장 긴 것
+ const idCand = tokens.filter(t => t.split('-').length >= 3).sort((a, b) => b.length - a.length);
+ const identificationNo = idCand[0] || '';
+
+ const residual = tokens.filter(t => t !== identificationNo);
+ if (!residual.length) return { identificationNo, jointNo: '', tagNo: '' };
+
+ residual.sort((a, b) => a.length - b.length);
+ const jointNo = residual[0] || '';
+ const tagNo = residual[residual.length - 1] || '';
+
+ return { identificationNo, jointNo, tagNo };
}
+
/* -------------------------------------------------------------------------- */
/* Helpers */
/* -------------------------------------------------------------------------- */
const clean = (s: string = '') => s.replace(/[\r\n\t]+/g, ' ').replace(/\s+/g, ' ').trim();
const isBlankRow = (row: string[]) => row.every(c => !clean(c));
-function isValidRow (r: ExtractedRow): boolean {
- console.log(`✅ Validating row: no="${r.no}", id="${r.identificationNo}", tag="${r.tagNo}", joint="${r.jointNo}"`);
-
- // Level 1: 기존 엄격한 조건
- if (r.no && r.no.trim() || r.identificationNo && r.identificationNo.trim()) {
- console.log(`✅ Level 1 validation passed (has no or identification)`);
- return true;
- }
-
- // Level 2: 완화된 조건 - 주요 필드 중 2개 이상
- const mainFields = [
- r.no?.trim(),
- r.identificationNo?.trim(),
- r.tagNo?.trim(),
- r.jointNo?.trim()
- ].filter(Boolean);
-
- if (mainFields.length >= 2) {
- console.log(`✅ Level 2 validation passed (${mainFields.length} main fields present)`);
- return true;
- }
-
- // Level 3: 더 관대한 조건 - 어떤 필드든 하나라도 의미있는 값
- const allFields = [
- r.no?.trim(),
- r.identificationNo?.trim(),
- r.tagNo?.trim(),
- r.jointNo?.trim(),
- r.jointType?.trim(),
- r.weldingDate?.trim()
- ].filter(field => field && field.length > 1); // 1글자 이상
-
- if (allFields.length >= 1) {
- console.log(`✅ Level 3 validation passed (${allFields.length} fields with meaningful content)`);
- return true;
- }
-
- console.log(`❌ Validation failed - no meaningful content found`);
- return false;
-}
+const isValidRow = (r: ExtractedRow) => !!(r.no || r.identificationNo);
function scoreRow (r: ExtractedRow): number {
const w: Record<keyof ExtractedRow, number> = {
@@ -808,4 +267,4 @@ export function analyzeOCRQuality (ocrResult: any) {
});
return { confidence: cnt ? conf / cnt : 0, tablesFound: tbl, textQuality: cnt ? kw / cnt : 0, keywordCount: kw };
-}
+} \ No newline at end of file