diff --git a/CHANGELOG.md b/CHANGELOG.md index 5905707e2..2701f96ae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +# 0.2.0-dev0 + +### Enhancements + +* **Added migration for GitHub Source V2** + ## 0.2.0 ### Enhancements diff --git a/requirements/connectors/github.in b/requirements/connectors/github.in index 685d0f350..7b16b92b3 100644 --- a/requirements/connectors/github.in +++ b/requirements/connectors/github.in @@ -1,5 +1,4 @@ -c ../common/constraints.txt -# NOTE - pygithub==1.58.0 fails due to https://github.com/PyGithub/PyGithub/issues/2436 -pygithub>1.58.0 +pygithub>=2.4.0 requests diff --git a/test_e2e/expected-structured-output/github/LICENSE.txt.json b/test_e2e/expected-structured-output/github/LICENSE.txt.json index 75fcab765..6572a1d69 100644 --- a/test_e2e/expected-structured-output/github/LICENSE.txt.json +++ b/test_e2e/expected-structured-output/github/LICENSE.txt.json @@ -1,57 +1,107 @@ [ { + "type": "Title", "element_id": "52585ab256e2832166ca185be6c76cc9", + "text": "Downloadify: Client Side File Creation JavaScript + Flash Library", "metadata": { - "filetype": "text/plain", "languages": [ "eng" - ] - }, - "text": "Downloadify: Client Side File Creation JavaScript + Flash Library", - "type": "Title" + ], + "filetype": "text/plain", + "data_source": { + "url": "https://api.github.com/repos/dcneiner/Downloadify/git/blobs/2c4f1ab8689a6dfef4ee7d13d4d935cb6663a7e4", + "version": "W/\"bb342a3e84a4ce514665385d7d61fb2922b0705ff23ad599a3e2d355aabe3f21\"", + "record_locator": { + "repo_path": "dcneiner/Downloadify", + "file_path": "LICENSE.txt" + }, + "permissions_data": null, + "filesize_bytes": 1127 + } + } }, { + "type": "Title", "element_id": "107ab54e7143d022fee38d5dfe235f89", + "text": "Copyright (c) 2009 Douglas C. Neiner", "metadata": { - "filetype": "text/plain", "languages": [ "eng" - ] - }, - "text": "Copyright (c) 2009 Douglas C. 
Neiner", - "type": "Title" + ], + "filetype": "text/plain", + "data_source": { + "url": "https://api.github.com/repos/dcneiner/Downloadify/git/blobs/2c4f1ab8689a6dfef4ee7d13d4d935cb6663a7e4", + "version": "W/\"bb342a3e84a4ce514665385d7d61fb2922b0705ff23ad599a3e2d355aabe3f21\"", + "record_locator": { + "repo_path": "dcneiner/Downloadify", + "file_path": "LICENSE.txt" + }, + "permissions_data": null, + "filesize_bytes": 1127 + } + } }, { + "type": "NarrativeText", "element_id": "1cd03f5c7eea429178fc15c9d6c4cbd4", + "text": "Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the \"Software\"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:", "metadata": { - "filetype": "text/plain", "languages": [ "eng" - ] - }, - "text": "Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the \"Software\"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:", - "type": "NarrativeText" + ], + "filetype": "text/plain", + "data_source": { + "url": "https://api.github.com/repos/dcneiner/Downloadify/git/blobs/2c4f1ab8689a6dfef4ee7d13d4d935cb6663a7e4", + "version": "W/\"bb342a3e84a4ce514665385d7d61fb2922b0705ff23ad599a3e2d355aabe3f21\"", + "record_locator": { + "repo_path": "dcneiner/Downloadify", + "file_path": "LICENSE.txt" + }, + "permissions_data": null, + "filesize_bytes": 1127 + } + } }, { + "type": "NarrativeText", "element_id": 
"5da204497a4873a8d0f71ad7865cea7e", + "text": "The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.", "metadata": { - "filetype": "text/plain", "languages": [ "eng" - ] - }, - "text": "The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.", - "type": "NarrativeText" + ], + "filetype": "text/plain", + "data_source": { + "url": "https://api.github.com/repos/dcneiner/Downloadify/git/blobs/2c4f1ab8689a6dfef4ee7d13d4d935cb6663a7e4", + "version": "W/\"bb342a3e84a4ce514665385d7d61fb2922b0705ff23ad599a3e2d355aabe3f21\"", + "record_locator": { + "repo_path": "dcneiner/Downloadify", + "file_path": "LICENSE.txt" + }, + "permissions_data": null, + "filesize_bytes": 1127 + } + } }, { + "type": "NarrativeText", "element_id": "1b454f06bfa94b6d367e0e812ae32655", + "text": "THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.", "metadata": { - "filetype": "text/plain", "languages": [ "eng" - ] - }, - "text": "THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.", - "type": "NarrativeText" + ], + "filetype": "text/plain", + "data_source": { + "url": "https://api.github.com/repos/dcneiner/Downloadify/git/blobs/2c4f1ab8689a6dfef4ee7d13d4d935cb6663a7e4", + "version": "W/\"bb342a3e84a4ce514665385d7d61fb2922b0705ff23ad599a3e2d355aabe3f21\"", + "record_locator": { + "repo_path": "dcneiner/Downloadify", + "file_path": "LICENSE.txt" + }, + "permissions_data": null, + "filesize_bytes": 1127 + } + } } ] \ No newline at end of file diff --git a/test_e2e/expected-structured-output/github/test.html.json b/test_e2e/expected-structured-output/github/test.html.json index de309dd59..3cef33562 100644 --- a/test_e2e/expected-structured-output/github/test.html.json +++ b/test_e2e/expected-structured-output/github/test.html.json @@ -1,52 +1,92 @@ [ { + "type": "Title", "element_id": "218722ac66e142a570ab2053b430c6c4", + "text": "Downloadify Example", "metadata": { - "filetype": "text/html", "languages": [ "eng" - ] - }, - "text": "Downloadify Example", - "type": "Title" + ], + "filetype": "text/html", + "data_source": { + "url": "https://api.github.com/repos/dcneiner/Downloadify/git/blobs/c63c8fc21d46d44de85a14a3ed4baec0348ce344", + "version": "W/\"bb342a3e84a4ce514665385d7d61fb2922b0705ff23ad599a3e2d355aabe3f21\"", + "record_locator": { + "repo_path": "dcneiner/Downloadify", + "file_path": "test.html" + }, + "permissions_data": null, + "filesize_bytes": 3001 + } + } }, { + "type": "Title", "element_id": "bf0fab1925c4b2cbb23a53afce28ebd2", + "text": "More info available at the Github Project Page", "metadata": { - "filetype": "text/html", - "languages": [ - "eng" - ], "link_texts": [ "Github Project Page" ], "link_urls": [ "http://github.com/dcneiner/Downloadify" - ] - }, - "text": "More 
info available at the Github Project Page", - "type": "Title" + ], + "languages": [ + "eng" + ], + "filetype": "text/html", + "data_source": { + "url": "https://api.github.com/repos/dcneiner/Downloadify/git/blobs/c63c8fc21d46d44de85a14a3ed4baec0348ce344", + "version": "W/\"bb342a3e84a4ce514665385d7d61fb2922b0705ff23ad599a3e2d355aabe3f21\"", + "record_locator": { + "repo_path": "dcneiner/Downloadify", + "file_path": "test.html" + }, + "permissions_data": null, + "filesize_bytes": 3001 + } + } }, { + "type": "Title", "element_id": "395aed29cd13842fede90a1a8677aa4b", + "text": "Downloadify Invoke Script For This Page", "metadata": { - "filetype": "text/html", "languages": [ "eng" - ] - }, - "text": "Downloadify Invoke Script For This Page", - "type": "Title" + ], + "filetype": "text/html", + "data_source": { + "url": "https://api.github.com/repos/dcneiner/Downloadify/git/blobs/c63c8fc21d46d44de85a14a3ed4baec0348ce344", + "version": "W/\"bb342a3e84a4ce514665385d7d61fb2922b0705ff23ad599a3e2d355aabe3f21\"", + "record_locator": { + "repo_path": "dcneiner/Downloadify", + "file_path": "test.html" + }, + "permissions_data": null, + "filesize_bytes": 3001 + } + } }, { + "type": "NarrativeText", "element_id": "2e22c39e004cb7d566294080c976efc8", + "text": "Downloadify.create('downloadify',{\n filename: function(){\n return document.getElementById('filename').value;\n },\n data: function(){ \n return document.getElementById('data').value;\n },\n onComplete: function(){ \n alert('Your File Has Been Saved!'); \n },\n onCancel: function(){ \n alert('You have cancelled the saving of this file.');\n },\n onError: function(){ \n alert('You must put something in the File Contents or there will be nothing to save!'); \n },\n swf: 'media/downloadify.swf',\n downloadImage: 'images/download.png',\n width: 100,\n height: 30,\n transparent: true,\n append: false\n});", "metadata": { - "filetype": "text/html", "languages": [ "eng" - ] - }, - "text": "Downloadify.create('downloadify',{\n 
filename: function(){\n return document.getElementById('filename').value;\n },\n data: function(){ \n return document.getElementById('data').value;\n },\n onComplete: function(){ \n alert('Your File Has Been Saved!'); \n },\n onCancel: function(){ \n alert('You have cancelled the saving of this file.');\n },\n onError: function(){ \n alert('You must put something in the File Contents or there will be nothing to save!'); \n },\n swf: 'media/downloadify.swf',\n downloadImage: 'images/download.png',\n width: 100,\n height: 30,\n transparent: true,\n append: false\n});", - "type": "NarrativeText" + ], + "filetype": "text/html", + "data_source": { + "url": "https://api.github.com/repos/dcneiner/Downloadify/git/blobs/c63c8fc21d46d44de85a14a3ed4baec0348ce344", + "version": "W/\"bb342a3e84a4ce514665385d7d61fb2922b0705ff23ad599a3e2d355aabe3f21\"", + "record_locator": { + "repo_path": "dcneiner/Downloadify", + "file_path": "test.html" + }, + "permissions_data": null, + "filesize_bytes": 3001 + } + } } ] \ No newline at end of file diff --git a/test_e2e/src/github.sh b/test_e2e/src/github.sh index cf4d5cec3..c4b4c897b 100755 --- a/test_e2e/src/github.sh +++ b/test_e2e/src/github.sh @@ -46,14 +46,14 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ --partition-endpoint "https://api.unstructuredapp.io" \ --num-processes "$max_processes" \ --download-dir "$DOWNLOAD_DIR" \ - --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ + --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth,metadata.data_source.date_created,metadata.data_source.date_modified \ --strategy hi_res \ --preserve-downloads \ --reprocess \ --output-dir "$OUTPUT_DIR" \ --verbose \ --url dcneiner/Downloadify \ - --git-file-glob '*.html,*.txt' \ + 
--file-glob '*.html,*.txt' \ --work-dir "$WORK_DIR" \ $ACCESS_TOKEN_FLAGS diff --git a/test_e2e/test-dest.sh b/test_e2e/test-dest.sh index 597f29196..f4d8a7dc3 100755 --- a/test_e2e/test-dest.sh +++ b/test_e2e/test-dest.sh @@ -20,7 +20,7 @@ all_tests=( 'azure-cognitive-search.sh' 'box.sh' 'chroma.sh' - 'clarifai.sh' + # 'clarifai.sh' 'couchbase.sh' 'dropbox.sh' 'elasticsearch.sh' diff --git a/unstructured_ingest/__version__.py b/unstructured_ingest/__version__.py index 23894ce6c..c3224483d 100644 --- a/unstructured_ingest/__version__.py +++ b/unstructured_ingest/__version__.py @@ -1 +1 @@ -__version__ = "0.2.0" # pragma: no cover +__version__ = "0.2.0-dev0" # pragma: no cover diff --git a/unstructured_ingest/v2/processes/connectors/__init__.py b/unstructured_ingest/v2/processes/connectors/__init__.py index eec65b95d..4f139da47 100644 --- a/unstructured_ingest/v2/processes/connectors/__init__.py +++ b/unstructured_ingest/v2/processes/connectors/__init__.py @@ -22,6 +22,8 @@ from .delta_table import delta_table_destination_entry from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry +from .github import CONNECTOR_TYPE as GITHUB_CONNECTOR_TYPE +from .github import github_source_entry from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE from .google_drive import google_drive_source_entry from .kdbai import CONNECTOR_TYPE as KDBAI_CONNECTOR_TYPE @@ -67,6 +69,8 @@ destination_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_destination_entry ) +add_source_entry(source_type=GITHUB_CONNECTOR_TYPE, entry=github_source_entry) + add_source_entry(source_type=GOOGLE_DRIVE_CONNECTOR_TYPE, entry=google_drive_source_entry) add_source_entry(source_type=LOCAL_CONNECTOR_TYPE, entry=local_source_entry) diff --git a/unstructured_ingest/v2/processes/connectors/github.py b/unstructured_ingest/v2/processes/connectors/github.py new file mode 100644 index 
# GitHub source connector (v2).
#
# Indexes every blob of a repository's git tree and downloads each file through
# the GitHub API (falling back to the raw download URL for files that are too
# large for the contents endpoint).
from __future__ import annotations

from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from time import time
from typing import TYPE_CHECKING, Any, Generator, Optional
from urllib.parse import urlparse

from pydantic import Field, Secret, model_validator

from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
from unstructured_ingest.utils.dep_check import requires_dependencies
from unstructured_ingest.v2.interfaces import (
    AccessConfig,
    ConnectionConfig,
    Downloader,
    DownloaderConfig,
    DownloadResponse,
    FileData,
    FileDataSourceMetadata,
    Indexer,
    IndexerConfig,
    SourceIdentifiers,
)
from unstructured_ingest.v2.logger import logger
from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry

CONNECTOR_TYPE = "github"

# Format of the HTTP date headers ("last-modified", "date") returned by the API.
_HTTP_DATE_FORMAT = "%a, %d %b %Y %H:%M:%S %Z"

if TYPE_CHECKING:
    from github.ContentFile import ContentFile
    from github.Repository import Repository


class GitHubAccessConfig(AccessConfig):
    git_access_token: Optional[str] = Field(
        default=None,
        description="Optional personal access token for authenticating with the GitHub API.",
    )


class GitHubConnectionConfig(ConnectionConfig):
    url: str = Field(
        description=(
            "The full URL to the GitHub project or repository, used to determine "
            "the base URL and repo path."
        )
    )

    access_config: Secret[GitHubAccessConfig] = Field(
        default=GitHubAccessConfig(),
        validate_default=True,
        description="Secret configuration for accessing the GitHub API by authentication tokens.",
    )

    branch: Optional[str] = Field(
        default=None,
        overload_name="git_branch",
        description=(
            "The branch to interact with. If not provided, the default branch for the"
            " repository is used."
        ),
    )

    repo_path: str = Field(
        default=None,
        init=False,
        repr=False,
        description="The normalized repository path extracted from the GitHub URL.",
    )

    @model_validator(mode="before")
    @classmethod
    def set_repo_path(cls, values: dict) -> dict:
        """Parse the provided GitHub URL and set the `repo_path` value.

        Accepts either a full ``https://github.com/owner/repo`` URL or the short
        ``owner/repo`` form and stores ``owner/repo`` under ``repo_path``.

        Args:
            values (dict): A dictionary of field values passed to the model.

        Returns:
            dict: The updated dictionary of values with the `repo_path` field set.

        Raises:
            ValueError: If the URL is not properly formatted or doesn't match the
                expected GitHub structure.
        """
        # A "before" validator receives the raw input, which is not guaranteed
        # to be a mapping; leave anything else for pydantic's own validation.
        if not isinstance(values, dict):
            return values

        url = values.get("url")
        if not url:
            return values

        parsed_gh_url = urlparse(url)
        path_fragments = [fragment for fragment in parsed_gh_url.path.split("/") if fragment]

        if (
            (parsed_gh_url.scheme and parsed_gh_url.scheme != "https")
            or (parsed_gh_url.netloc and parsed_gh_url.netloc != "github.com")
            or len(path_fragments) != 2
        ):
            raise ValueError(
                'Please provide a valid URL, e.g. "https://github.com/owner/repo" or '
                '"owner/repo".'
            )

        values["repo_path"] = "/".join(path_fragments)
        return values

    @SourceConnectionError.wrap
    @requires_dependencies(["github"], extras="github")
    def get_repo(self) -> "Repository":
        """Return the PyGithub repository object for `repo_path`.

        The client is built from the configured access token; the token may be
        `None`, in which case it is passed through to `Github` unchanged.
        """
        from github import Github

        github = Github(self.access_config.get_secret_value().git_access_token)
        return github.get_repo(self.repo_path)


class GitHubIndexerConfig(IndexerConfig):
    recursive: bool = Field(
        default=False,
        description=(
            "Flag to control recursive operations when indexing. "
            "If True, the indexer will traverse directories recursively."
        ),
    )


@dataclass
class GitHubIndexer(Indexer):
    connection_config: GitHubConnectionConfig
    index_config: GitHubIndexerConfig

    @requires_dependencies(["github"], extras="github")
    def precheck(self) -> None:
        """Validate the connection to the GitHub repository.

        Sends a `HEAD` request for the repository to the GitHub API using the
        PyGithub `Requester` with retry support. The access token is optional:
        when none is configured the request is sent unauthenticated, matching
        what `GitHubConnectionConfig.get_repo` does.

        Raises:
            SourceConnectionError: If the connection validation fails.
        """
        from github import Auth, Consts
        from github.GithubRetry import GithubRetry
        from github.Requester import Requester

        access_token = self.connection_config.access_config.get_secret_value().git_access_token
        # `Auth.Token` rejects empty/None tokens, so only build it when a token
        # is actually configured; `Requester` accepts `auth=None`.
        auth = Auth.Token(access_token) if access_token else None

        try:
            requester = Requester(
                auth=auth,
                base_url=Consts.DEFAULT_BASE_URL,
                timeout=Consts.DEFAULT_TIMEOUT,
                user_agent=Consts.DEFAULT_USER_AGENT,
                per_page=Consts.DEFAULT_PER_PAGE,
                verify=True,
                retry=GithubRetry(),
                pool_size=None,
            )
            url = f"{Consts.DEFAULT_BASE_URL}/repos/{self.connection_config.repo_path}"
            logger.debug(f"Precheck Request: {url!r}")

            headers, _ = requester.requestJsonAndCheck("HEAD", url)
            logger.debug(f"Headers from HEAD request: {headers}")
        except Exception as e:
            logger.error(f"Failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"Failed to validate connection: {e}") from e

    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
        """Iterate over the repository tree, yielding a `FileData` for every blob.

        The tree is fetched for the configured branch, or for the repository's
        default branch when none is configured. Non-blob elements are skipped.

        Args:
            **kwargs (Any): Additional optional arguments.

        Yields:
            FileData: An object containing metadata and identifiers for each file in the repository.
        """
        repo = self.connection_config.get_repo()
        sha = self.connection_config.branch or repo.default_branch
        logger.info(f"Starting to look for blob files on GitHub in branch: {sha!r}")

        git_tree = repo.get_git_tree(sha, recursive=self.index_config.recursive)

        for element in git_tree.tree:
            if element.type != "blob":
                continue

            record_locator = {
                "repo_path": self.connection_config.repo_path,
                "file_path": element.path,
            }
            if self.connection_config.branch is not None:
                record_locator["branch"] = self.connection_config.branch

            # NOTE(review): these are response headers attached to the element,
            # not per-file timestamps — confirm they carry the intended dates.
            date_modified = datetime.strptime(
                element._headers["last-modified"], _HTTP_DATE_FORMAT
            ).isoformat()
            date_created = datetime.strptime(
                element._headers["date"], _HTTP_DATE_FORMAT
            ).isoformat()

            additional_metadata = {
                key: element._headers[key]
                for key in ("content-type", "mode", "type", "size")
                if key in element._headers
            }

            yield FileData(
                identifier=element.sha,
                connector_type=CONNECTOR_TYPE,
                source_identifiers=SourceIdentifiers(
                    fullpath=element.path,
                    filename=element.path.split("/")[-1],
                    # Tree paths are already relative to the repository root;
                    # stripping the "owner/repo" substring would corrupt any
                    # path that happens to contain it.
                    rel_path=element.path,
                ),
                metadata=FileDataSourceMetadata(
                    url=element.url,
                    version=element.etag,
                    record_locator=record_locator,
                    date_modified=date_modified,
                    date_created=date_created,
                    date_processed=str(time()),
                ),
                additional_metadata=additional_metadata,
            )


class GitHubDownloaderConfig(DownloaderConfig):
    pass


@dataclass
class GitHubDownloader(Downloader):
    connection_config: GitHubConnectionConfig
    download_config: GitHubDownloaderConfig

    def is_async(self) -> bool:
        """Report that this downloader must be driven through `run_async`."""
        return True

    @requires_dependencies(["github"], extras="github")
    def _fetch_file(self, path: str) -> ContentFile:
        """Fetch a file from the GitHub repository using the GitHub API.

        Args:
            path (str): The path to the file in the repository.

        Returns:
            ContentFile: An object containing the file content.
        """
        try:
            logger.info(f"Fetching file from path: {path!r}")
            content_file = self.connection_config.get_repo().get_contents(path)
        except Exception as e:
            logger.error(f"Failed to download {path}: {e}")
            raise

        return content_file

    @SourceConnectionNetworkError.wrap
    @requires_dependencies(["httpx", "github"], extras="github")
    async def _fetch_content(self, content_file: ContentFile) -> bytes:
        """Retrieve the content of a file, handling large files via direct download.

        When the API returned no inline content (encoding ``"none"`` with a
        non-zero size) the file is fetched from its `download_url`; otherwise
        the decoded inline content is returned.

        Args:
            content_file (ContentFile): The file object from the GitHub API.

        Returns:
            bytes: The content of the file as bytes.
        """
        import httpx

        needs_direct_download = (
            not content_file.content and content_file.encoding == "none" and content_file.size
        )
        if not needs_direct_download:
            return content_file.decoded_content

        logger.info("File too large for the GitHub API, using direct download link instead.")
        # Only open an HTTP client when the direct download is actually needed.
        async with httpx.AsyncClient() as client:
            try:
                response = await client.get(content_file.download_url, timeout=10.0)
                response.raise_for_status()
            except Exception as e:
                logger.error(f"Failed to download: {e}")
                raise

            return response.content

    async def _fetch_and_write(self, path: str, download_path: Path) -> None:
        """Fetch a file from GitHub and write its content to the specified local path.

        Args:
            path (str): The path to the file in the repository.
            download_path (Path): The local path to save the downloaded file.

        Raises:
            ValueError: If the file content could not be retrieved.
        """
        content_file = self._fetch_file(path)
        contents = await self._fetch_content(content_file)
        if contents is None:
            raise ValueError(
                f"Failed to retrieve file from repo "
                f"{self.connection_config.url}/{path}. Check logs",
            )

        logger.info(f"Writing download file to path: {download_path!r}")
        with download_path.open("wb") as f:
            f.write(contents)

    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        # Synchronous run is not implemented
        raise NotImplementedError()

    async def run_async(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
        """Download a file from the GitHub repository and return a `DownloadResponse`.

        Args:
            file_data (FileData): Metadata about the file to be downloaded.
            **kwargs (Any): Additional optional arguments.

        Returns:
            DownloadResponse: A response containing the details of the download.
        """
        download_path = self.get_download_path(file_data=file_data)
        download_path.parent.mkdir(parents=True, exist_ok=True)

        path = file_data.source_identifiers.fullpath
        await self._fetch_and_write(path, download_path)

        return self.generate_download_response(file_data=file_data, download_path=download_path)


github_source_entry = SourceRegistryEntry(
    connection_config=GitHubConnectionConfig,
    indexer_config=GitHubIndexerConfig,
    indexer=GitHubIndexer,
    downloader_config=GitHubDownloaderConfig,
    downloader=GitHubDownloader,
)