1515from __future__ import annotations
1616
1717import os
18- from typing import cast , Literal , Optional , Union
18+ from typing import cast , Optional , Union
1919
2020import IPython .display as ipy_display
2121import requests
@@ -314,28 +314,59 @@ def image_blur(
314314
315315 return dst
316316
def pdf_extract(self, *, connection: Optional[str] = None) -> list:
    """Extract text from PDF blobs by applying a BigQuery UDF.

    Builds a UDF from ``blob_func.pdf_extract_def`` bound to this
    accessor's session and the resolved connection, then applies it to
    the value returned by ``self._get_runtime_json_str(mode="R")``.

    Args:
        connection (str or None, default None): BQ connection handed to
            the transform function. The value is first passed through
            ``self._resolve_connection``; presumably ``None`` resolves
            to the session's default connection -- confirm against
            that helper.

    Returns:
        The result of applying the extraction UDF, returned as-is
        (no ``tolist()`` or other conversion is performed here).
        NOTE(review): the signature is annotated ``-> list`` but the
        value looks like a Series-like object produced by ``apply``;
        confirm and tighten the annotation.
    """
    # Imported lazily inside the method, mirroring the sibling blob methods.
    import bigframes.blob._functions as blob_func

    resolved_connection = self._resolve_connection(connection)

    transform = blob_func.TransformFunction(
        blob_func.pdf_extract_def,
        session=self._block.session,
        connection=resolved_connection,
    )
    pdf_extract_udf = transform.udf()

    # Runtime JSON is requested in read ("R") mode before the UDF is applied.
    runtime_json = self._get_runtime_json_str(mode="R")
    return runtime_json.apply(pdf_extract_udf)
344+
317345 def pdf_chunk (
318346 self ,
319347 * ,
320- dst_table : str ,
321348 connection : Optional [str ] = None ,
322- if_exists : Literal ["fail" , "replace" , "append" ] = "replace" ,
349+ chunk_size : int = 1000 ,
350+ overlap_size : int = 200 ,
323351 ) -> list :
324352 """Extracts and chunks text from PDF files and saves the text as
325- array of string.
353+ array of string.
326354
327355 Args:
328- dst_table (str): Destination Bigquery table (project.dataset.table).
329356 connection (str or None, default None): BQ connection used for
330357 function internet transactions, and the output blob if "dst"
331358 is str. If None, uses default connection of the session.
332- if_exists (Literal["fail", "replace", "append"], default "replace"):
333- What to do if the table exists.
359+ chunk_size (int, default 1000): the desired size of each text chunk
360+ (number of characters).
361+ overlap_size (int, default 200): the number of overlapping characters
362+ between consecutive chunks. This helps to ensure context is
363+ preserved across chunk boundaries.
334364
335365 Returns:
336366 list: A list of strings, where each string is a chunk of text extracted
337367 from the PDFs.
338368 """
369+
339370 import bigframes .bigquery as bbq
340371 import bigframes .blob ._functions as blob_func
341372 import bigframes .pandas as bpd
@@ -349,17 +380,13 @@ def pdf_chunk(
349380 ).udf ()
350381
351382 src_rt = self ._get_runtime_json_str (mode = "R" )
352-
353383 df = src_rt .to_frame ()
384+ df ["chunk_size" ] = chunk_size
385+ df ["overlap_size" ] = overlap_size
354386
355387 res = df .apply (pdf_chunk_udf , axis = 1 )
356- res_bf = bpd .Series (res )
357- res_df = bpd .DataFrame ({"text_array" : bbq .json_extract_string_array (res_bf )})
358-
359- res_df .to_gbq (
360- destination_table = dst_table ,
361- if_exists = if_exists ,
362- )
388+ res .cache () # to execute the udf
363389
364- text_array = res_df ["text_array" ].tolist ()
365- return text_array
390+ res_bf = bpd .Series (res )
391+ res_list = bbq .json_extract_string_array (res_bf ).tolist ()
392+ return res_list
0 commit comments