Skip to content

Commit 80afa09

Browse files
committed
support (1) extract a pdf file and save into a string (2) extract and chunk pdf files
1 parent 313f4af commit 80afa09

File tree

2 files changed

+92
-21
lines changed

2 files changed

+92
-21
lines changed

bigframes/blob/_functions.py

Lines changed: 48 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,8 @@ def image_blur_func(
130130
image_blur_def = FunctionDef(image_blur_func, ["opencv-python", "numpy", "requests"])
131131

132132

133-
def pdf_chunk_func(src_obj_ref_rt: str) -> str:
133+
# Extracts all text from a PDF
134+
def pdf_extract_func(src_obj_ref_rt: str) -> str:
134135
import io
135136
import json
136137

@@ -147,13 +148,56 @@ def pdf_chunk_func(src_obj_ref_rt: str) -> str:
147148
pdf_file = io.BytesIO(pdf_bytes)
148149
reader = PdfReader(pdf_file, strict=False)
149150

150-
all_text = []
151+
all_text = ""
151152
for page in reader.pages:
152153
page_extract_text = page.extract_text()
153154
if page_extract_text:
154-
all_text.append(page_extract_text)
155+
all_text += page_extract_text
156+
return all_text
155157

156-
all_text_json_string = json.dumps(all_text)
158+
159+
pdf_extract_def = FunctionDef(pdf_extract_func, ["pypdf", "requests"])
160+
161+
162+
# Chunks the text from a PDF
163+
def pdf_chunk_func(src_obj_ref_rt: str, chunk_size: int, overlap_size: int) -> str:
    """Download a PDF, extract its text, and split it into overlapping chunks.

    Args:
        src_obj_ref_rt: JSON string whose ``access_urls.read_url`` entry is
            the URL the PDF is downloaded from.
        chunk_size: Maximum number of characters per chunk. Must be positive.
        overlap_size: Number of characters shared by consecutive chunks.
            Must be non-negative and smaller than ``chunk_size``; 0 yields
            back-to-back chunks with no overlap.

    Returns:
        A JSON-encoded array of strings, one per chunk, in document order.
        The array is empty when no text could be extracted.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap_size`` is out of range.
        requests.HTTPError: If the download responds with an error status.
    """
    # Imports stay function-local, as in the original block (presumably so
    # the function body is self-contained when deployed -- confirm against
    # FunctionDef).
    import io
    import json

    # Validate before the third-party imports and the network round trip so
    # bad arguments fail fast. chunk_size is checked first so that
    # chunk_size=0 reports the real problem rather than an overlap error.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive.")
    if overlap_size < 0:
        raise ValueError("overlap_size must be non-negative.")
    if overlap_size >= chunk_size:
        raise ValueError("overlap_size must be smaller than chunk_size.")

    from pypdf import PdfReader  # type: ignore
    import requests

    src_url = json.loads(src_obj_ref_rt)["access_urls"]["read_url"]

    # A timeout keeps a stalled server from hanging the call indefinitely.
    response = requests.get(src_url, stream=True, timeout=30)
    response.raise_for_status()

    reader = PdfReader(io.BytesIO(response.content), strict=False)

    # Pages for which extract_text() returns a falsy value are skipped.
    page_texts = (page.extract_text() for page in reader.pages)
    all_text_str = "".join(text for text in page_texts if text)

    # Each chunk starts `step` characters after the previous one; slicing
    # clamps the final chunk to the end of the text.
    step = chunk_size - overlap_size
    all_text_chunks = [
        all_text_str[start : start + chunk_size]
        for start in range(0, len(all_text_str), step)
    ]

    return json.dumps(all_text_chunks)
159203

bigframes/operations/blob.py

Lines changed: 44 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from __future__ import annotations
1616

1717
import os
18-
from typing import cast, Literal, Optional, Union
18+
from typing import cast, Optional, Union
1919

2020
import IPython.display as ipy_display
2121
import requests
@@ -314,28 +314,59 @@ def image_blur(
314314

315315
return dst
316316

317+
def pdf_extract(self, *, connection: Optional[str] = None) -> list:
    """Extracts all text from each PDF file as a single string.

    Unlike ``pdf_chunk``, the text is not split into chunks.

    Args:
        connection (str or None, default None): BQ connection used for
            function internet transactions. If None, uses default
            connection of the session.

    Returns:
        The result of applying the PDF text-extraction function to the
        runtime JSON string of each blob; each value holds all the text
        extracted from the corresponding PDF file.

        NOTE(review): the ``list`` return annotation looks inaccurate --
        the value returned is whatever ``apply`` yields on the runtime
        JSON strings (presumably a Series). Confirm and update.
    """

    import bigframes.blob._functions as blob_func

    connection = self._resolve_connection(connection)

    pdf_extract_udf = blob_func.TransformFunction(
        blob_func.pdf_extract_def,
        session=self._block.session,
        connection=connection,
    ).udf()

    # mode="R": read-only access to the source blobs is requested.
    src_rt = self._get_runtime_json_str(mode="R")
    return src_rt.apply(pdf_extract_udf)
344+
317345
def pdf_chunk(
318346
self,
319347
*,
320-
dst_table: str,
321348
connection: Optional[str] = None,
322-
if_exists: Literal["fail", "replace", "append"] = "replace",
349+
chunk_size: int = 1000,
350+
overlap_size: int = 200,
323351
) -> list:
324352
"""Extracts and chunks text from PDF files and saves the text as
325-
array of string.
353+
array of string.
326354
327355
Args:
328-
dst_table (str): Destination Bigquery table (project.dataset.table).
329356
connection (str or None, default None): BQ connection used for
330357
function internet transactions, and the output blob if "dst"
331358
is str. If None, uses default connection of the session.
332-
if_exists (Literal["fail", "replace", "append"], default "replace"):
333-
What to do if the table exists.
359+
chunk_size (int, default 1000): the desired size of each text chunk
360+
(number of characters).
361+
overlap_size (int, default 200): the number of overlapping characters
362+
between consecutive chunks. This helps to ensure context is
363+
preserved across chunk boundaries.
334364
335365
Returns:
336366
list: A list of strings, where each string is a chunk of text extracted
337367
from the PDFs.
338368
"""
369+
339370
import bigframes.bigquery as bbq
340371
import bigframes.blob._functions as blob_func
341372
import bigframes.pandas as bpd
@@ -349,17 +380,13 @@ def pdf_chunk(
349380
).udf()
350381

351382
src_rt = self._get_runtime_json_str(mode="R")
352-
353383
df = src_rt.to_frame()
384+
df["chunk_size"] = chunk_size
385+
df["overlap_size"] = overlap_size
354386

355387
res = df.apply(pdf_chunk_udf, axis=1)
356-
res_bf = bpd.Series(res)
357-
res_df = bpd.DataFrame({"text_array": bbq.json_extract_string_array(res_bf)})
358-
359-
res_df.to_gbq(
360-
destination_table=dst_table,
361-
if_exists=if_exists,
362-
)
388+
res.cache() # to execute the udf
363389

364-
text_array = res_df["text_array"].tolist()
365-
return text_array
390+
res_bf = bpd.Series(res)
391+
res_list = bbq.json_extract_string_array(res_bf).tolist()
392+
return res_list

0 commit comments

Comments
 (0)