 import logging
+import os
 import subprocess
 import time
 from collections.abc import Generator
+from datetime import datetime
 from fnmatch import fnmatch as fnmatch_path
 from logging import Filter
 from pathlib import Path
@@ -176,7 +178,22 @@ def add_documents(self, documents: list[Document], batch_size: int = 10) -> None
             documents: List of documents to add
             batch_size: Number of documents to process in each batch
         """
-        list(self.add_documents_progress(documents, batch_size=batch_size))
+        # Process documents in batches
+        for i in range(0, len(documents), batch_size):
+            batch = documents[i : i + batch_size]
+            self._add_documents(batch)
+
+            # Update stored timestamps after successful indexing
+            for doc in batch:
+                if "source" in doc.metadata:
+                    abs_path = str(Path(doc.metadata["source"]).resolve())
+                    current_mtime = os.path.getmtime(abs_path)
+                    doc.metadata["last_modified"] = self._normalize_timestamp(
+                        current_mtime
+                    )
+                    # Update the document in the collection
+                    self.collection.update(
+                        ids=[doc.doc_id],
+                        metadatas=[doc.metadata],
+                    )
 
     def add_documents_progress(
         self, documents: list[Document], batch_size: int = 10
@@ -201,6 +218,14 @@ def _add_documents(self, documents: list[Document]) -> None:
             doc = self._generate_doc_id(doc)
             assert doc.doc_id is not None
 
+            # Refresh last_modified in metadata from the source file's mtime
+            if "source" in doc.metadata:
+                abs_path = str(Path(doc.metadata["source"]).resolve())
+                current_mtime = os.path.getmtime(abs_path)
+                doc.metadata["last_modified"] = self._normalize_timestamp(
+                    current_mtime
+                )
+
             contents.append(doc.content)
             metadatas.append(doc.metadata)
             ids.append(doc.doc_id)
@@ -869,14 +894,54 @@ def _get_valid_files(
 
         return valid_files
 
+    def _normalize_timestamp(self, timestamp: str | float | int | None) -> str:
+        """Normalize timestamp to ISO format string."""
+        if timestamp is None:
+            return datetime.fromtimestamp(0).isoformat()
+        try:
+            if isinstance(timestamp, int | float):
+                return datetime.fromtimestamp(float(timestamp)).isoformat()
+            # If it's already an ISO string, validate and return
+            if isinstance(timestamp, str):
+                datetime.fromisoformat(timestamp)  # Validate format
+                return timestamp
+            raise ValueError(f"Unsupported timestamp type: {type(timestamp)}")
+        except (ValueError, TypeError) as e:
+            logger.warning("Invalid timestamp format: %s (%s)", timestamp, e)
+            return datetime.fromtimestamp(0).isoformat()
+
+    def _compare_timestamps(self, stored: str, current: float) -> bool:
+        """Compare stored ISO timestamp with current Unix timestamp.
+
+        Returns True if current is newer than stored."""
+        try:
+            stored_ts = datetime.fromisoformat(stored).timestamp()
+            # Round to seconds for comparison
+            return int(current) > int(stored_ts)
+        except (ValueError, TypeError) as e:
+            logger.warning("Error comparing timestamps: %s", e)
+            return True  # If we can't compare, assume modified
+
+    def _get_stored_timestamps(self) -> dict[str, str]:
+        """Get stored timestamps for all indexed files."""
+        stored = {}
+        for doc in self.get_all_documents():
+            if "source" in doc.metadata:
+                abs_path = str(Path(doc.metadata["source"]).resolve())
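+                # Stored values may be raw mtimes or ISO strings; normalize to ISO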
+                timestamp = self._normalize_timestamp(doc.metadata.get("last_modified"))
+                stored[abs_path] = timestamp
+                logger.debug("Stored timestamp for %s: %s", abs_path, timestamp)
+        return stored
+
     def collect_documents(
-        self, path: Path, glob_pattern: str = "**/*.*"
+        self, path: Path, glob_pattern: str = "**/*.*", check_modified: bool = True
     ) -> list[Document]:
         """Collect documents from a file or directory without processing them.
 
         Args:
             path: Path to collect documents from
             glob_pattern: Pattern to match files (only used for directories)
+            check_modified: Whether to check for modifications (skip unchanged files)
 
         Returns:
             List of documents ready for processing
@@ -888,8 +953,33 @@ def collect_documents(
             logger.debug(f"No valid files found in {path}")
             return documents
 
+        if check_modified:
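+            # Fetch the stored timestamps once, before walking the files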
+            stored_timestamps = self._get_stored_timestamps()
+
         # Process files in order (least deep first)
         for file_path in sorted(valid_files, key=lambda x: len(x.parts)):
+            abs_path = str(file_path.resolve())
+
+            if check_modified:
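+                # Compare the file's mtime against the timestamp recorded at indexing time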
+                current_mtime = os.path.getmtime(file_path)
+                stored_timestamp = stored_timestamps.get(abs_path)
+
+                if stored_timestamp and not self._compare_timestamps(
+                    stored_timestamp, current_mtime
+                ):
+                    logger.debug("Skipping unchanged file: %s", abs_path)
+                    continue
+
+                if not stored_timestamp:
+                    logger.debug("New file: %s", abs_path)
+                else:
+                    logger.debug(
+                        "Modified file: %s (current: %s, stored: %s)",
+                        abs_path,
+                        self._normalize_timestamp(current_mtime),
+                        stored_timestamp,
+                    )
+
             logger.debug(f"Processing {file_path}")
             documents.extend(Document.from_file(file_path, processor=self.processor))
 
@@ -918,6 +1008,4 @@ def get_all_documents(self) -> list[Document]:
         """
         logger.debug("Getting all documents from index")
         docs = self.list_documents(group_by_source=False)
-        for doc in docs:
-            logger.debug("Retrieved document with metadata: %s", doc.metadata)
         return docs