Support (1) extracting all text from a PDF file into a string and (2) extracting and chunking text from PDF files

shuoweil · shuoweil · commit 80afa099ba92 · 2025-02-07T05:01:34.000Z
diff --git a/bigframes/blob/_functions.py b/bigframes/blob/_functions.py
@@ -130,7 +130,8 @@ def image_blur_func(
 image_blur_def = FunctionDef(image_blur_func, ["opencv-python", "numpy", "requests"])
 
 
-def pdf_chunk_func(src_obj_ref_rt: str) -> str:
+# Extracts all text from a PDF
+def pdf_extract_func(src_obj_ref_rt: str) -> str:
     import io
     import json
 
@@ -147,13 +148,56 @@ def pdf_chunk_func(src_obj_ref_rt: str) -> str:
     pdf_file = io.BytesIO(pdf_bytes)
     reader = PdfReader(pdf_file, strict=False)
 
-    all_text = []
+    all_text = ""
     for page in reader.pages:
         page_extract_text = page.extract_text()
         if page_extract_text:
-            all_text.append(page_extract_text)
+            all_text += page_extract_text
+    return all_text
 
-    all_text_json_string = json.dumps(all_text)
+
+pdf_extract_def = FunctionDef(pdf_extract_func, ["pypdf", "requests"])
+
+
+# Chunks the text from a PDF
+def pdf_chunk_func(src_obj_ref_rt: str, chunk_size: int, overlap_size: int) -> str:
+    import io
+    import json
+
+    from pypdf import PdfReader  # type: ignore
+    import requests
+
+    if overlap_size >= chunk_size:
+        raise ValueError("overlap_size must be smaller than chunk_size.")
+    if chunk_size <= 0:
+        raise ValueError("chunk_size must be positive.")
+    if overlap_size <= 0:
+        raise ValueError("overlap_size must be positive.")
+
+    src_obj_ref_rt_json = json.loads(src_obj_ref_rt)
+    src_url = src_obj_ref_rt_json["access_urls"]["read_url"]
+
+    response = requests.get(src_url, stream=True)
+    response.raise_for_status()
+    pdf_bytes = response.content
+
+    pdf_file = io.BytesIO(pdf_bytes)
+    reader = PdfReader(pdf_file, strict=False)
+
+    all_text_str = ""
+    for page in reader.pages:
+        page_extract_text = page.extract_text()
+        if page_extract_text:
+            all_text_str += page_extract_text
+
+    all_text_chunks = []
+    start = 0
+    while start < len(all_text_str):
+        end = min(start + chunk_size, len(all_text_str))
+        all_text_chunks.append(all_text_str[start:end])
+        start += chunk_size - overlap_size
+
+    all_text_json_string = json.dumps(all_text_chunks)
 
     return all_text_json_string
 
diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py
@@ -15,7 +15,7 @@
 from __future__ import annotations
 
 import os
-from typing import cast, Literal, Optional, Union
+from typing import cast, Optional, Union
 
 import IPython.display as ipy_display
 import requests
@@ -314,28 +314,59 @@ def image_blur(
 
         return dst
 
+    def pdf_extract(self, *, connection: Optional[str] = None) -> list:
+        """Extracts all text from PDF files, producing one string per
+           file.
+
+        Args:
+            connection (str or None, default None): BQ connection used for
+                function internet transactions, and the output blob if "dst"
+                is str. If None, uses default connection of the session.
+
+        Returns:
+            str: contains all text from a pdf file
+        """
+
+        import bigframes.blob._functions as blob_func
+
+        connection = self._resolve_connection(connection)
+
+        pdf_chunk_udf = blob_func.TransformFunction(
+            blob_func.pdf_extract_def,
+            session=self._block.session,
+            connection=connection,
+        ).udf()
+
+        src_rt = self._get_runtime_json_str(mode="R")
+        res = src_rt.apply(pdf_chunk_udf)
+
+        return res
+
     def pdf_chunk(
         self,
         *,
-        dst_table: str,
         connection: Optional[str] = None,
-        if_exists: Literal["fail", "replace", "append"] = "replace",
+        chunk_size: int = 1000,
+        overlap_size: int = 200,
     ) -> list:
         """Extracts and chunks text from PDF files and saves the text as
-            array of string.
+           array of string.
 
         Args:
-            dst_table (str): Destination Bigquery table (project.dataset.table).
             connection (str or None, default None): BQ connection used for
                 function internet transactions, and the output blob if "dst"
                 is str. If None, uses default connection of the session.
-            if_exists (Literal["fail", "replace", "append"], default "replace"):
-                What to do if the table exists.
+            chunk_size (int, default 1000): the desired size of each text chunk
+                (number of characters).
+            overlap_size (int, default 200): the number of overlapping characters
+                between consecutive chunks. This helps to ensure context is
+                preserved across chunk boundaries.
 
         Returns:
             list: A list of strings, where each string is a chunk of text extracted
                 from the PDFs.
         """
+
         import bigframes.bigquery as bbq
         import bigframes.blob._functions as blob_func
         import bigframes.pandas as bpd
@@ -349,17 +380,13 @@ def pdf_chunk(
         ).udf()
 
         src_rt = self._get_runtime_json_str(mode="R")
-
         df = src_rt.to_frame()
+        df["chunk_size"] = chunk_size
+        df["overlap_size"] = overlap_size
 
         res = df.apply(pdf_chunk_udf, axis=1)
-        res_bf = bpd.Series(res)
-        res_df = bpd.DataFrame({"text_array": bbq.json_extract_string_array(res_bf)})
-
-        res_df.to_gbq(
-            destination_table=dst_table,
-            if_exists=if_exists,
-        )
+        res.cache()  # to execute the udf
 
-        text_array = res_df["text_array"].tolist()
-        return text_array
+        res_bf = bpd.Series(res)
+        res_list = bbq.json_extract_string_array(res_bf).tolist()
+        return res_list