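"""Document ingestion pipeline.

Coordinates document processing, OCR, chunking, embedding generation,
vector indexing (Qdrant), and metadata tracking.
"""
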
import logging
import time
from pathlib import Path
from typing import Dict, List, Any, Optional, Tuple
from dataclasses import dataclass
from concurrent.futures import ThreadPoolExecutor, as_completed
import hashlib
import os
import sys


sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from src.document_processor import (
    DocumentProcessor, DocumentProcessorFactory, ProcessedDocument, 
    DocumentChunk, ProcessingStatus, DocumentType
)
from src.embedding_system import EmbeddingSystem
from src.vector_store import QdrantVectorStore
from src.metadata_manager import MetadataManager, DocumentMetadata
from src.image_processor import ImageProcessor


try:
    from logger.custom_logger import CustomLoggerTracker
    custom_log = CustomLoggerTracker()
    logger = custom_log.get_logger("ingestion_pipeline")

except ImportError:
    # Fallback to standard logging if custom logger not available
    logger = logging.getLogger("ingestion_pipeline")


@dataclass
class IngestionResult:
    """Result of document ingestion."""
    document_id: str
    filename: str
    success: bool
    processing_time: float
    chunks_created: int
    chunks_indexed: int
    error_message: Optional[str] = None
    warnings: Optional[List[str]] = None
    
    def __post_init__(self):
        if self.warnings is None:
            self.warnings = []


@dataclass
class IngestionStats:
    """Statistics for batch ingestion."""
    total_documents: int
    successful_documents: int
    failed_documents: int
    total_chunks: int
    total_processing_time: float
    average_processing_time: float
    documents_by_type: Dict[str, int]
    errors: List[str]



def jina_embeddings(text: str) -> List[float]:
    """Embed a single text with the Jina embeddings API (jina-embeddings-v3)."""
    import requests

    # Read the key from the environment rather than hardcoding a secret.
    api_key = os.environ.get("JINA_API_KEY", "")

    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {api_key}'}

    data = {
        "model": "jina-embeddings-v3",
        "task": "retrieval.passage",
        "input": text}

    response = requests.post('https://api.jina.ai/v1/embeddings', headers=headers, json=data)
    response.raise_for_status()
    return response.json()['data'][0]['embedding']
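

# A batched variant, shown as an illustrative sketch only (it is not called by
# the pipeline). It assumes the Jina endpoint also accepts a list of strings
# under "input"; verify against the current API docs before relying on it.
def jina_embeddings_batch(texts: List[str]) -> List[List[float]]:
    import requests

    api_key = os.environ.get("JINA_API_KEY", "")
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {api_key}'}

    data = {
        "model": "jina-embeddings-v3",
        "task": "retrieval.passage",
        "input": texts}

    response = requests.post('https://api.jina.ai/v1/embeddings', headers=headers, json=data)
    response.raise_for_status()
    # One embedding per input text, in request order.
    return [item['embedding'] for item in response.json()['data']]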


class DocumentIngestionPipeline:
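    """End-to-end pipeline: process, chunk, embed, and index documents."""
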
    def __init__(self, config: Dict[str, Any]):
        self.config = config

        # Set the vector dimension before creating components so the vector
        # store is built with the right size (1024 matches jina-embeddings-v3).
        self.vector_size = config.get('vector_size', 1024)
        self.config['vector_size'] = self.vector_size

        # Initialize components
        self.embedding_system = EmbeddingSystem(config)
        self.vector_store = QdrantVectorStore(config)
        self.metadata_manager = MetadataManager(config)

        # Initialize image processor for OCR
        self.image_processor = ImageProcessor(config)
        
        # Pipeline settings
        self.chunk_size = config.get('chunk_size', 512)
        self.chunk_overlap = config.get('chunk_overlap', 50)
        self.batch_size = config.get('embedding_batch_size', 32)
        self.max_workers = config.get('max_workers', 4)
        self.enable_ocr = config.get('image_processing', True)
        
        logger.info(f"Document ingestion pipeline initialized")
    
    def ingest_document(self, file_path: str, document_id: Optional[str] = None) -> IngestionResult:
        """
        Ingest a single document through the complete pipeline.
        
        Args:
            file_path: Path to the document file
            document_id: Optional custom document ID
            
        Returns:
            IngestionResult with processing details
        """
        start_time = time.time()
        file_path_obj = Path(file_path)
        filename = file_path_obj.name
        
        try:
            logger.info(f"Starting ingestion of document: {filename}")
            
            # Generate document ID if not provided
            if not document_id:
                document_id = self._generate_document_id(file_path)
            
            # Check if document already exists
            existing_metadata = self.metadata_manager.get_document_metadata(document_id)
            if existing_metadata and existing_metadata.processing_status == ProcessingStatus.COMPLETED:
                logger.info(f"Document {filename} already processed, skipping")
                return IngestionResult(
                    document_id=document_id,
                    filename=filename,
                    success=True,
                    processing_time=0.0,
                    chunks_created=existing_metadata.total_chunks,
                    chunks_indexed=existing_metadata.total_chunks,
                    warnings=["Document already processed"]
                )
            
            # Step 1: Process document
            processed_doc = self._process_document(file_path)
            if processed_doc.processing_status == ProcessingStatus.FAILED:
                return IngestionResult(
                    document_id=document_id,
                    filename=filename,
                    success=False,
                    processing_time=time.time() - start_time,
                    chunks_created=0,
                    chunks_indexed=0,
                    error_message=processed_doc.error_message
                )
            
            # Step 2: Process images with OCR if enabled
            if self.enable_ocr and processed_doc.images:
                processed_doc.images = self.image_processor.batch_process_images(processed_doc.images)
            
            # Step 3: Create document chunks
            processor = DocumentProcessorFactory.create_processor(file_path, self.config)
            chunks = processor.extract_chunks(processed_doc, self.chunk_size, self.chunk_overlap)
            
            if not chunks:
                logger.warning(f"No chunks created for document: {filename}")
                return IngestionResult(
                    document_id=document_id,
                    filename=filename,
                    success=False,
                    processing_time=time.time() - start_time,
                    chunks_created=0,
                    chunks_indexed=0,
                    error_message="No content chunks could be created"
                )

            # Step 4: Generate embeddings
            chunk_texts = [chunk.content for chunk in chunks]
            logger.debug(f"Sample chunks: {chunk_texts[:2]}")
            # The hosted Jina API is used here instead of the local embedding
            # system (left commented below for reference).
            # embeddings = self.embedding_system.generate_embeddings(chunk_texts)
            embeddings = [jina_embeddings(text) for text in chunk_texts]

            if not embeddings or len(embeddings) != len(chunks):
                logger.error(f"Embedding generation failed for document: {filename}")
                return IngestionResult(
                    document_id=document_id,
                    filename=filename,
                    success=False,
                    processing_time=time.time() - start_time,
                    chunks_created=len(chunks),
                    chunks_indexed=0,
                    error_message="Failed to generate embeddings"
                )
            
            # Attach embeddings to chunks
            for chunk, embedding in zip(chunks, embeddings):
                chunk.embedding = embedding

            # Step 5: Store in vector database
            vector_success = self.vector_store.add_documents(chunks)
            if not vector_success:
                logger.error(f"Failed to store vectors for document: {filename}")
                return IngestionResult(
                    document_id=document_id,
                    filename=filename,
                    success=False,
                    processing_time=time.time() - start_time,
                    chunks_created=len(chunks),
                    chunks_indexed=0,
                    error_message="Failed to store document vectors"
                )
            
            # Step 6: Store metadata
            processing_time = time.time() - start_time
            metadata = DocumentMetadata(
                document_id=document_id,
                filename=filename,
                file_path=file_path,
                file_type=processed_doc.document_type.value,
                upload_timestamp=processed_doc.processing_timestamp,
                processing_status=ProcessingStatus.COMPLETED,
                total_chunks=len(chunks),
                file_size=processed_doc.file_size,
                checksum=processed_doc.checksum,
                processing_time=processing_time,
                metadata_json=self._serialize_metadata(processed_doc.metadata)
            )
            
            metadata_success = self.metadata_manager.store_document_metadata(document_id, metadata)
            if not metadata_success:
                logger.warning(f"Failed to store metadata for document: {filename}")
            
            logger.info(f"Successfully ingested document {filename}: {len(chunks)} chunks in {processing_time:.2f}s")
            
            return IngestionResult(
                document_id=document_id,
                filename=filename,
                success=True,
                processing_time=processing_time,
                chunks_created=len(chunks),
                chunks_indexed=len(chunks)
            )
            
        except Exception as e:
            error_msg = f"Ingestion failed for {filename}: {str(e)}"
            logger.error(error_msg)
            
            # Update metadata with error status
            if document_id:
                self.metadata_manager.update_document_status(
                    document_id, 
                    ProcessingStatus.FAILED, 
                    error_msg,
                    time.time() - start_time
                )
            
            return IngestionResult(
                document_id=document_id or "unknown",
                filename=filename,
                success=False,
                processing_time=time.time() - start_time,
                chunks_created=0,
                chunks_indexed=0,
                error_message=error_msg
            )
    
    def ingest_batch(self, file_paths: List[str], max_workers: Optional[int] = None) -> IngestionStats:
        """
        Ingest multiple documents in parallel.
        
        Args:
            file_paths: List of file paths to process
            max_workers: Maximum number of worker threads
            
        Returns:
            IngestionStats with batch processing results
        """
        start_time = time.time()
        max_workers = max_workers or self.max_workers
        
        logger.info(f"Starting batch ingestion of {len(file_paths)} documents with {max_workers} workers")
        
        results = []
        errors = []
        documents_by_type = {}
        
        # Process documents in parallel
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit all tasks
            future_to_path = {
                executor.submit(self.ingest_document, file_path): file_path 
                for file_path in file_paths
            }
            
            # Collect results
            for future in as_completed(future_to_path):
                file_path = future_to_path[future]
                try:
                    result = future.result()
                    results.append(result)
                    
                    # Track document types
                    file_ext = Path(file_path).suffix.lower()
                    documents_by_type[file_ext] = documents_by_type.get(file_ext, 0) + 1
                    
                    if not result.success:
                        errors.append(f"{result.filename}: {result.error_message}")
                        
                except Exception as e:
                    error_msg = f"Failed to process {file_path}: {str(e)}"
                    errors.append(error_msg)
                    logger.error(error_msg)
        
        # Calculate statistics
        successful_results = [r for r in results if r.success]
        failed_results = [r for r in results if not r.success]
        
        total_processing_time = time.time() - start_time
        total_chunks = sum(r.chunks_indexed for r in successful_results)
        avg_processing_time = (
            sum(r.processing_time for r in results) / len(results) 
            if results else 0.0
        )
        
        stats = IngestionStats(
            total_documents=len(file_paths),
            successful_documents=len(successful_results),
            failed_documents=len(failed_results),
            total_chunks=total_chunks,
            total_processing_time=total_processing_time,
            average_processing_time=avg_processing_time,
            documents_by_type=documents_by_type,
            errors=errors
        )
        
        logger.info(f"Batch ingestion completed: {stats.successful_documents}/{stats.total_documents} "
                   f"documents processed successfully in {total_processing_time:.2f}s")
        
        return stats
    
    def reprocess_document(self, document_id: str) -> IngestionResult:
        """
        Reprocess an existing document.
        
        Args:
            document_id: ID of the document to reprocess
            
        Returns:
            IngestionResult with reprocessing details
        """
        # Get existing metadata
        metadata = self.metadata_manager.get_document_metadata(document_id)
        if not metadata:
            return IngestionResult(
                document_id=document_id,
                filename="unknown",
                success=False,
                processing_time=0.0,
                chunks_created=0,
                chunks_indexed=0,
                error_message="Document not found in metadata"
            )
        
        # Delete existing vectors
        self.vector_store.delete_document(document_id)
        
        # Reprocess the document
        return self.ingest_document(metadata.file_path, document_id)
    
    def delete_document(self, document_id: str) -> bool:
        """
        Delete a document and all associated data.
        
        Args:
            document_id: ID of the document to delete
            
        Returns:
            True if successful, False otherwise
        """
        try:
            # Delete from vector store
            vector_success = self.vector_store.delete_document(document_id)
            
            # Delete from metadata
            metadata_success = self.metadata_manager.delete_document(document_id)
            
            success = vector_success and metadata_success
            if success:
                logger.info(f"Successfully deleted document: {document_id}")
            else:
                logger.warning(f"Partial deletion of document: {document_id}")
            
            return success
            
        except Exception as e:
            logger.error(f"Failed to delete document {document_id}: {e}")
            return False
    
    def _process_document(self, file_path: str) -> ProcessedDocument:
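        """Process a document with its type-specific processor; on error,
        return a ProcessedDocument marked FAILED instead of raising."""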
        try:
            processor = DocumentProcessorFactory.create_processor(file_path, self.config)
            return processor.process_document(file_path)
            
        except Exception as e:
            logger.error(f"Document processing failed for {file_path}: {e}")
            
            # Return failed document
            document_id = self._generate_document_id(file_path)
            return ProcessedDocument(
                document_id=document_id,
                filename=Path(file_path).name,
                file_path=file_path,
                document_type=DocumentType.UNKNOWN,
                content="",
                metadata={},
                processing_status=ProcessingStatus.FAILED,
                error_message=str(e)
            )
    
    def _generate_document_id(self, file_path: str) -> str:
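        """Derive a deterministic MD5-based document ID for the file."""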
        # Use file path and modification time for uniqueness
        file_path_obj = Path(file_path)
        if file_path_obj.exists():
            mtime = file_path_obj.stat().st_mtime
            content = f"{file_path}_{mtime}"
        else:
            content = f"{file_path}_{time.time()}"
        
        return hashlib.md5(content.encode()).hexdigest()
    
    def _serialize_metadata(self, metadata: Dict[str, Any]) -> str:
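        """Serialize document metadata to a JSON string ("{}" on failure)."""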
        try:
            import json
            return json.dumps(metadata, default=str, ensure_ascii=False)
        except Exception as e:
            logger.warning(f"Failed to serialize metadata: {e}")
            return "{}"
        
        
    
    def get_pipeline_stats(self) -> Dict[str, Any]:
        """
        Get statistics about the ingestion pipeline.
        
        Returns:
            Dictionary with pipeline statistics
        """
        try:
            # Get component statistics
            vector_stats = self.vector_store.get_collection_info()
            metadata_stats = self.metadata_manager.get_statistics()
            embedding_stats = self.embedding_system.get_cache_stats()
            
            return {
                "vector_store": vector_stats.__dict__ if vector_stats else {},
                "metadata_manager": metadata_stats,
                "embedding_system": embedding_stats,
                "pipeline_config": {
                    "chunk_size": self.chunk_size,
                    "chunk_overlap": self.chunk_overlap,
                    "batch_size": self.batch_size,
                    "max_workers": self.max_workers,
                    "enable_ocr": self.enable_ocr
                }
            }
            
        except Exception as e:
            logger.error(f"Failed to get pipeline stats: {e}")
            return {"error": str(e)}
    
    def health_check(self) -> Dict[str, bool]:
        """
        Check health of all pipeline components.
        
        Returns:
            Dictionary with health status of each component
        """
        return {
            "vector_store": self.vector_store.health_check(),
            "metadata_manager": True,  # SQLite is always available if file system works
            "embedding_system": True   # Will be checked during actual usage
        }



if __name__ == "__main__":
    logger.info("Ingestion pipeline initializing...")

    # Example usage
    import yaml
    with open("src/config.yaml", 'r') as f:
        config = yaml.safe_load(f)
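
    # Note: jina_embeddings reads JINA_API_KEY from the environment; set it
    # before running.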
    pipeline = DocumentIngestionPipeline(config)
    stats = pipeline.get_pipeline_stats()
    logger.info(f"Pipeline stats: {stats}")
    # Example single document ingestion
    result = pipeline.ingest_document("data/documents/3.수불확인등록.xlsx")
    logger.info(f"Ingestion result: {result}")
    # Example batch ingestion
    # batch_result = pipeline.ingest_batch(["sample_data/sample.pdf", "sample_data/sample.docx"])
    # logger.info(f"Batch ingestion stats: {batch_result}")