diff --git a/src/datacustomcode/cli.py b/src/datacustomcode/cli.py
index 0876ce3..a7ac5d6 100644
--- a/src/datacustomcode/cli.py
+++ b/src/datacustomcode/cli.py
@@ -145,18 +145,36 @@ def deploy(
 @cli.command()
 @click.argument("directory", default=".")
-def init(directory: str):
-    from datacustomcode.scan import dc_config_json_from_file
-    from datacustomcode.template import copy_template
+@click.option(
+    "--code-type", default="script", type=click.Choice(["script", "function"])
+)
+def init(directory: str, code_type: str):
+    from datacustomcode.scan import (
+        dc_config_json_from_file,
+        update_config,
+        write_sdk_config,
+    )
+    from datacustomcode.template import copy_function_template, copy_script_template
 
     click.echo("Copying template to " + click.style(directory, fg="blue", bold=True))
-    copy_template(directory)
+    if code_type == "script":
+        copy_script_template(directory)
+    elif code_type == "function":
+        copy_function_template(directory)
 
     entrypoint_path = os.path.join(directory, "payload", "entrypoint.py")
     config_location = os.path.join(os.path.dirname(entrypoint_path), "config.json")
-    config_json = dc_config_json_from_file(entrypoint_path)
+
+    # Write package type to SDK-specific config
+    sdk_config = {"type": code_type}
+    write_sdk_config(directory, sdk_config)
+
+    config_json = dc_config_json_from_file(entrypoint_path, code_type)
     with open(config_location, "w") as f:
         json.dump(config_json, f, indent=2)
+    updated_config_json = update_config(entrypoint_path)
+    with open(config_location, "w") as f:
+        json.dump(updated_config_json, f, indent=2)
     click.echo(
         "Start developing by updating the code in "
         + click.style(entrypoint_path, fg="blue", bold=True)
@@ -176,7 +194,7 @@ def init(directory: str):
     "--no-requirements", is_flag=True, help="Skip generating requirements.txt file"
 )
 def scan(filename: str, config: str, dry_run: bool, no_requirements: bool):
-    from datacustomcode.scan import dc_config_json_from_file, write_requirements_file
+    from datacustomcode.scan import update_config, write_requirements_file
 
     config_location = config or os.path.join(os.path.dirname(filename), "config.json")
     click.echo(
@@ -184,7 +202,7 @@ def scan(filename: str, config: str, dry_run: bool, no_requirements: bool):
         + click.style(config_location, fg="blue", bold=True)
     )
     click.echo("Scanning " + click.style(filename, fg="blue", bold=True) + "...")
-    config_json = dc_config_json_from_file(filename)
+    config_json = update_config(filename)
     click.secho(json.dumps(config_json, indent=2), fg="yellow")
 
     if not dry_run:
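
Aside (illustration only, not part of the patch): the on-disk layout that `init` now produces, assuming `datacustomcode init my_proj --code-type script`; the directory and file names come from the constants introduced in scan.py below.

    # my_proj/
    #   .datacustomcode_proj/
    #     sdk_config.json       <- {"type": "script"}, written by write_sdk_config()
    #   payload/
    #     entrypoint.py         <- template code to edit
    #     config.json           <- seeded by dc_config_json_from_file(), then
    #                              rewritten by update_config() with permissions
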
diff --git a/src/datacustomcode/scan.py b/src/datacustomcode/scan.py
index c1e4dc1..ce15108 100644
--- a/src/datacustomcode/scan.py
+++ b/src/datacustomcode/scan.py
@@ -16,7 +16,6 @@
 import ast
 import json
-import logging
 import os
 import sys
 from typing import (
@@ -27,26 +26,109 @@
     Union,
 )
 
+from loguru import logger
 import pydantic
 
 from datacustomcode.version import get_version
 
-logger = logging.getLogger(__name__)
-
 DATA_ACCESS_METHODS = ["read_dlo", "read_dmo", "write_to_dlo", "write_to_dmo"]
 DATA_TRANSFORM_CONFIG_TEMPLATE = {
     "sdkVersion": get_version(),
     "entryPoint": "",
-    "dataspace": "",
+    "dataspace": "default",
     "permissions": {
         "read": {},
         "write": {},
     },
 }
+FUNCTION_CONFIG_TEMPLATE = {
+    "sdkVersion": get_version(),
+    "entryPoint": "",
+}
 STANDARD_LIBS = set(sys.stdlib_module_names)
+SDK_CONFIG_DIR = ".datacustomcode_proj"
+SDK_CONFIG_FILE = "sdk_config.json"
+
+
+def get_sdk_config_path(base_directory: str) -> str:
+    """Get the path to the SDK-specific config file.
+
+    Args:
+        base_directory: The base directory of the project
+            (where .datacustomcode_proj should be)
+
+    Returns:
+        The path to the SDK config file
+    """
+    sdk_config_dir = os.path.join(base_directory, SDK_CONFIG_DIR)
+    return os.path.join(sdk_config_dir, SDK_CONFIG_FILE)
+
+
+def read_sdk_config(base_directory: str) -> dict[str, Any]:
+    """Read the SDK-specific config file.
+
+    Args:
+        base_directory: The base directory of the project
+
+    Returns:
+        The SDK config dictionary
+
+    Raises:
+        FileNotFoundError: If the SDK config file does not exist
+    """
+    config_path = get_sdk_config_path(base_directory)
+    if os.path.exists(config_path) and os.path.isfile(config_path):
+        try:
+            with open(config_path, "r") as f:
+                config_data: dict[str, Any] = json.load(f)
+            return config_data
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Failed to parse JSON from {config_path}: {e}") from e
+        except OSError as e:
+            raise OSError(f"Failed to read SDK config file {config_path}: {e}") from e
+    else:
+        raise FileNotFoundError(f"SDK config file not found at {config_path}")
+
+
+def write_sdk_config(base_directory: str, config: dict[str, Any]) -> None:
+    """Write the SDK-specific config file.
+
+    Args:
+        base_directory: The base directory of the project
+        config: The config dictionary to write
+    """
+    config_path = get_sdk_config_path(base_directory)
+    sdk_config_dir = os.path.dirname(config_path)
+    os.makedirs(sdk_config_dir, exist_ok=True)
+    with open(config_path, "w") as f:
+        json.dump(config, f, indent=2)
+
+
+def get_package_type(base_directory: str) -> str:
+    """Get the package type (script or function) from SDK config.
+
+    Args:
+        base_directory: The base directory of the project
+
+    Returns:
+        The package type ("script" or "function")
+
+    Raises:
+        ValueError: If the type is not found in the SDK config
+    """
+    try:
+        sdk_config = read_sdk_config(base_directory)
+    except FileNotFoundError as e:
+        logger.debug(f"Defaulting to script package type. {e}")
+        return "script"
+    if "type" not in sdk_config:
+        config_path = get_sdk_config_path(base_directory)
+        raise ValueError(
+            f"Package type not found in SDK config at {config_path}. "
+            "Please run 'datacustomcode init' to initialize the project."
+        )
+    return str(sdk_config["type"])
+
 
 class DataAccessLayerCalls(pydantic.BaseModel):
     read_dlo: frozenset[str]
@@ -230,37 +312,50 @@ def scan_file(file_path: str) -> DataAccessLayerCalls:
     return visitor.found()
 
 
-def dc_config_json_from_file(file_path: str) -> dict[str, Any]:
+def dc_config_json_from_file(file_path: str, type: str) -> dict[str, Any]:
     """Create a Data Cloud Custom Code config JSON from a script."""
-    output = scan_file(file_path)
-    config = DATA_TRANSFORM_CONFIG_TEMPLATE.copy()
+    config: dict[str, Any]
+    if type == "script":
+        config = DATA_TRANSFORM_CONFIG_TEMPLATE.copy()
+    elif type == "function":
+        config = FUNCTION_CONFIG_TEMPLATE.copy()
+    else:
+        raise ValueError(f"Unknown code type: {type!r}")
     config["entryPoint"] = file_path.rpartition("/")[-1]
+    return config
+
+
+def find_base_directory(file_path: str) -> str:
+    """Find the base directory containing .datacustomcode_proj by walking up
+    from file_path.
+
+    Args:
+        file_path: Path to a file in the project
+
+    Returns:
+        The base directory path, or the directory containing the file if not found
+    """
+    current_dir = os.path.dirname(os.path.abspath(file_path))
+    root = os.path.abspath(os.sep)
+    while current_dir != root:
+        if os.path.exists(os.path.join(current_dir, SDK_CONFIG_DIR)):
+            return current_dir
+        current_dir = os.path.dirname(current_dir)
+
+    # If not found, assume the payload directory's parent is the base
+    # (payload/entrypoint.py -> base directory is parent of payload)
+    file_dir = os.path.dirname(os.path.abspath(file_path))
+    if os.path.basename(file_dir) == "payload":
+        return os.path.dirname(file_dir)
+    return file_dir
+
+
+def update_config(file_path: str) -> dict[str, Any]:
     file_dir = os.path.dirname(file_path)
     config_json_path = os.path.join(file_dir, "config.json")
-
+    existing_config: dict[str, Any]
     if os.path.exists(config_json_path) and os.path.isfile(config_json_path):
         try:
             with open(config_json_path, "r") as f:
                 existing_config = json.load(f)
-
-            if "dataspace" in existing_config:
-                dataspace_value = existing_config["dataspace"]
-                if not dataspace_value or (
-                    isinstance(dataspace_value, str) and dataspace_value.strip() == ""
-                ):
-                    logger.warning(
-                        f"dataspace in {config_json_path} is empty or None. "
-                        f"Updating config file to use dataspace 'default'."
-                    )
-                    config["dataspace"] = "default"
-                else:
-                    config["dataspace"] = dataspace_value
-            else:
-                raise ValueError(
-                    f"dataspace must be defined in {config_json_path}. "
-                    f"Please add a 'dataspace' field to the config.json file."
-                )
         except json.JSONDecodeError as e:
             raise ValueError(
                 f"Failed to parse JSON from {config_json_path}: {e}"
@@ -268,19 +363,45 @@ def dc_config_json_from_file(file_path: str) -> dict[str, Any]:
             ) from e
         except OSError as e:
             raise OSError(f"Failed to read config file {config_json_path}: {e}") from e
     else:
-        config["dataspace"] = "default"
-
-    read: dict[str, list[str]] = {}
-    if output.read_dlo:
-        read["dlo"] = list(output.read_dlo)
-    else:
-        read["dmo"] = list(output.read_dmo)
-    write: dict[str, list[str]] = {}
-    if output.write_to_dlo:
-        write["dlo"] = list(output.write_to_dlo)
+        raise ValueError(f"config.json not found at {config_json_path}")
+
+    # Get package type from SDK config
+    base_directory = find_base_directory(file_path)
+    package_type = get_package_type(base_directory)
+
+    if package_type == "script":
+        existing_config["dataspace"] = get_dataspace(existing_config)
+        output = scan_file(file_path)
+        read: dict[str, list[str]] = {}
+        if output.read_dlo:
+            read["dlo"] = list(output.read_dlo)
+        else:
+            read["dmo"] = list(output.read_dmo)
+        write: dict[str, list[str]] = {}
+        if output.write_to_dlo:
+            write["dlo"] = list(output.write_to_dlo)
+        else:
+            write["dmo"] = list(output.write_to_dmo)
+
+        existing_config["permissions"] = {"read": read, "write": write}
+    return existing_config
+
+
+def get_dataspace(existing_config: dict[str, Any]) -> str:
+    if "dataspace" in existing_config:
+        dataspace_value = existing_config["dataspace"]
+        if not dataspace_value or (
+            isinstance(dataspace_value, str) and dataspace_value.strip() == ""
+        ):
+            logger.warning(
+                "dataspace is empty or None. "
+                "Updating config file to use dataspace 'default'."
+            )
+            return "default"
+        else:
+            return dataspace_value
     else:
-        write["dmo"] = list(output.write_to_dmo)
-
-    config["permissions"] = {"read": read, "write": write}
-
-    return config
+        raise ValueError(
+            "dataspace must be defined. "
+            "Please add a 'dataspace' field to the config.json file."
+        )
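
Aside (illustration only, not part of the patch): how the two-step scan flow fits together for a script-type project, mirroring what the `init` command above does.

    import json
    from datacustomcode.scan import dc_config_json_from_file, update_config

    # Seed payload/config.json from the template for the chosen code type...
    config = dc_config_json_from_file("payload/entrypoint.py", "script")
    with open("payload/config.json", "w") as f:
        json.dump(config, f, indent=2)

    # ...then re-scan the entrypoint and merge read/write permissions into it.
    # update_config() reads config.json back from disk, so it must exist first.
    merged = update_config("payload/entrypoint.py")
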
" + ) diff --git a/src/datacustomcode/template.py b/src/datacustomcode/template.py index 86c7ac8..195d4a2 100644 --- a/src/datacustomcode/template.py +++ b/src/datacustomcode/template.py @@ -17,15 +17,31 @@ from loguru import logger -template_dir = os.path.join(os.path.dirname(__file__), "templates") +script_template_dir = os.path.join(os.path.dirname(__file__), "templates", "script") +function_template_dir = os.path.join(os.path.dirname(__file__), "templates", "function") -def copy_template(target_dir: str) -> None: +def copy_script_template(target_dir: str) -> None: """Copy the template to the target directory.""" os.makedirs(target_dir, exist_ok=True) - for item in os.listdir(template_dir): - source = os.path.join(template_dir, item) + for item in os.listdir(script_template_dir): + source = os.path.join(script_template_dir, item) + destination = os.path.join(target_dir, item) + + if os.path.isdir(source): + logger.debug(f"Copying directory {source} to {destination}...") + shutil.copytree(source, destination, dirs_exist_ok=True) + else: + logger.debug(f"Copying file {source} to {destination}...") + shutil.copy2(source, destination) + + +def copy_function_template(target_dir: str) -> None: + os.makedirs(target_dir, exist_ok=True) + + for item in os.listdir(function_template_dir): + source = os.path.join(function_template_dir, item) destination = os.path.join(target_dir, item) if os.path.isdir(source): diff --git a/src/datacustomcode/templates/.devcontainer/devcontainer.json b/src/datacustomcode/templates/function/.devcontainer/devcontainer.json similarity index 100% rename from src/datacustomcode/templates/.devcontainer/devcontainer.json rename to src/datacustomcode/templates/function/.devcontainer/devcontainer.json diff --git a/src/datacustomcode/templates/Dockerfile b/src/datacustomcode/templates/function/Dockerfile similarity index 100% rename from src/datacustomcode/templates/Dockerfile rename to src/datacustomcode/templates/function/Dockerfile diff --git a/src/datacustomcode/templates/Dockerfile.dependencies b/src/datacustomcode/templates/function/Dockerfile.dependencies similarity index 100% rename from src/datacustomcode/templates/Dockerfile.dependencies rename to src/datacustomcode/templates/function/Dockerfile.dependencies diff --git a/src/datacustomcode/templates/README.md b/src/datacustomcode/templates/function/README.md similarity index 100% rename from src/datacustomcode/templates/README.md rename to src/datacustomcode/templates/function/README.md diff --git a/src/datacustomcode/templates/account.ipynb b/src/datacustomcode/templates/function/account.ipynb similarity index 100% rename from src/datacustomcode/templates/account.ipynb rename to src/datacustomcode/templates/function/account.ipynb diff --git a/src/datacustomcode/templates/build_native_dependencies.sh b/src/datacustomcode/templates/function/build_native_dependencies.sh similarity index 100% rename from src/datacustomcode/templates/build_native_dependencies.sh rename to src/datacustomcode/templates/function/build_native_dependencies.sh diff --git a/src/datacustomcode/templates/examples/employee_hierarchy/employee_data.csv b/src/datacustomcode/templates/function/examples/employee_hierarchy/employee_data.csv similarity index 100% rename from src/datacustomcode/templates/examples/employee_hierarchy/employee_data.csv rename to src/datacustomcode/templates/function/examples/employee_hierarchy/employee_data.csv diff --git a/src/datacustomcode/templates/examples/employee_hierarchy/entrypoint.py 
diff --git a/src/datacustomcode/templates/function/payload/config.json b/src/datacustomcode/templates/function/payload/config.json
new file mode 100644
index 0000000..9e26dfe
--- /dev/null
+++ b/src/datacustomcode/templates/function/payload/config.json
@@ -0,0 +1 @@
+{}
\ No newline at end of file
diff --git a/src/datacustomcode/templates/function/payload/entrypoint.py b/src/datacustomcode/templates/function/payload/entrypoint.py
new file mode 100644
index 0000000..a2b6892
--- /dev/null
+++ b/src/datacustomcode/templates/function/payload/entrypoint.py
@@ -0,0 +1,119 @@
+import logging
+from typing import List
+from uuid import uuid4
+
+logger = logging.getLogger(__name__)
+
+
+def chunk_text(text: str, chunk_size: int = 1000) -> List[str]:
+    """
+    Split text into chunks of approximately chunk_size characters.
+    Tries to split at sentence boundaries when possible.
+    """
+    if not text:
+        return []
+
+    chunks = []
+    current_chunk = ""
+
+    # Split text into sentences (simple split by period)
+    sentences = text.split(". ")
+
+    for sentence in sentences:
+        if len(current_chunk) + len(sentence) <= chunk_size:
+            current_chunk += sentence + ". "
+        else:
+            if current_chunk:
+                chunks.append(current_chunk.strip())
+            current_chunk = sentence + ". "
+
+    if current_chunk:
+        chunks.append(current_chunk.strip())
+
+    return chunks
+
+
+def dc_function(request: dict) -> dict:
+    logger.info("Inside DC Function")
+    logger.info(request)
+
+    items = request["input"]
+    output_chunks = []
+    current_seq_no = 1  # Start sequence number from 1
+
+    for item in items:
+        # Item is DocElement as dict
+        logger.info(f"Processing item: {item}")
+
+        text = item.get("text", "")
+        metadata = item.get("metadata", {})
+
+        # Create chunks from the text
+        text_chunks = chunk_text(text, chunk_size=100)  # Small chunks for the demo
+
+        # Create chunk dictionaries for each text chunk
+        for chunk_content in text_chunks:
+            chunk_dict = {
+                "text": chunk_content,
+                "metadata": metadata,
+                "seq_no": current_seq_no,
+                "chunk_type": "text",
+                "chunk_id": str(uuid4()),
+                "tag_metadata": {},
+                "citations": {},
+                "source_record": item,
+            }
+            output_chunks.append(chunk_dict)
+            current_seq_no += 1  # Increment sequence number for next chunk
+
+    logger.info("Completed chunking")
+    response = {
+        "output": output_chunks,
+        "status": {"status_type": "success", "status_message": "Chunking completed"},
+    }
+    logger.info(response)
+    return response
+
+
+# Test the function
+if __name__ == "__main__":
+    # Configure logging
+    logging.basicConfig(level=logging.INFO)
+
+    # Create test data with two DocElements
+    test_request = {
+        "input": [
+            {
+                "text": (
+                    """This is the first sentence of the first document, which is
+                    intentionally made longer to test chunking. """
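
Aside (worked example, traced by hand; not part of the template): `chunk_text` splits on ". " and only starts a new chunk at a sentence boundary, so a single sentence longer than `chunk_size` is still emitted whole.

    # chunk_text("One two three. Four five. Six.", chunk_size=20)
    # -> ["One two three.", "Four five. Six.."]
    # The trailing ".." on the last chunk is a known quirk: the final sentence
    # keeps its own period and the ". " joiner is appended unconditionally.
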
""" + """Here is the second sentence of the first document, which is also + quite long and should ensure that the chunking function splits + this text into two chunks when the chunk size is set to 100.""" + ), + "metadata": {"source": "test1", "type": "document"}, + }, + { + "text": ( + """This is the first sentence of the second document, and it is + also extended to be longer than usual for testing purposes. """ + """The second sentence of the second document is similarly lengthy, + so that the chunking function will again create two chunks for + this document.""" + ), + "metadata": {"source": "test2", "type": "document"}, + }, + ] + } + + # Run the function + result = dc_function(test_request) + + # Print the results in a more readable format + print("\nChunking Results:") + print("----------------") + for chunk in result["output"]: + print(f"\nChunk #{chunk['seq_no']}:") + print(f"Text: {chunk['text'][:100]}...") # Print first 100 chars of each chunk + print(f"Source: {chunk['metadata']['source']}") + print(f"Chunk ID: {chunk['chunk_id']}") diff --git a/src/datacustomcode/templates/requirements-dev.txt b/src/datacustomcode/templates/function/requirements-dev.txt similarity index 100% rename from src/datacustomcode/templates/requirements-dev.txt rename to src/datacustomcode/templates/function/requirements-dev.txt diff --git a/src/datacustomcode/templates/requirements.txt b/src/datacustomcode/templates/function/requirements.txt similarity index 100% rename from src/datacustomcode/templates/requirements.txt rename to src/datacustomcode/templates/function/requirements.txt diff --git a/src/datacustomcode/templates/script/.devcontainer/devcontainer.json b/src/datacustomcode/templates/script/.devcontainer/devcontainer.json new file mode 100644 index 0000000..3bdca00 --- /dev/null +++ b/src/datacustomcode/templates/script/.devcontainer/devcontainer.json @@ -0,0 +1,10 @@ +{ + "name": "Existing Dockerfile", + "build": { + "context": "..", + "dockerfile": "../Dockerfile" + }, + "features": { + "ghcr.io/devcontainers/features/git:1": {}, + } +} diff --git a/src/datacustomcode/templates/script/Dockerfile b/src/datacustomcode/templates/script/Dockerfile new file mode 100644 index 0000000..23eba84 --- /dev/null +++ b/src/datacustomcode/templates/script/Dockerfile @@ -0,0 +1,18 @@ +FROM public.ecr.aws/emr-on-eks/spark/emr-7.3.0:latest + +USER root + +# install from dev requirements.txt +COPY requirements-dev.txt ./requirements-dev.txt +RUN pip3.11 install --no-cache-dir -r requirements-dev.txt + +# Install from requirements.txt: +COPY requirements.txt ./requirements.txt +RUN pip3.11 install --no-cache-dir -r requirements.txt + +# Create workspace directory +RUN mkdir /workspace + +# Set user and working directory +USER hadoop:hadoop +WORKDIR /workspace diff --git a/src/datacustomcode/templates/script/Dockerfile.dependencies b/src/datacustomcode/templates/script/Dockerfile.dependencies new file mode 100644 index 0000000..360499c --- /dev/null +++ b/src/datacustomcode/templates/script/Dockerfile.dependencies @@ -0,0 +1,11 @@ +FROM public.ecr.aws/emr-on-eks/spark/emr-7.3.0:latest + +USER root + +RUN pip3.11 install venv-pack + +# Create workspace directory +RUN mkdir /workspace +WORKDIR /workspace + +CMD ["./build_native_dependencies.sh"] diff --git a/src/datacustomcode/templates/script/README.md b/src/datacustomcode/templates/script/README.md new file mode 100644 index 0000000..e69de29 diff --git a/src/datacustomcode/templates/script/account.ipynb b/src/datacustomcode/templates/script/account.ipynb new 
diff --git a/src/datacustomcode/templates/script/account.ipynb b/src/datacustomcode/templates/script/account.ipynb
new file mode 100644
index 0000000..75d8bb3
--- /dev/null
+++ b/src/datacustomcode/templates/script/account.ipynb
@@ -0,0 +1,86 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datacustomcode.client import Client\n",
+    "from datacustomcode.io.writer.base import WriteMode\n",
+    "from pyspark.sql.functions import col, upper"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client = Client()\n",
+    "\n",
+    "df = client.read_dlo(\"Account_Home__dll\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Perform transformations on the DataFrame\n",
+    "df_upper1 = df.withColumn(\"Description__c\", upper(col(\"Description__c\")))\n",
+    "\n",
+    "# Drop specific columns related to relationships\n",
+    "df_upper1 = df_upper1.drop(\"KQ_ParentId__c\")\n",
+    "df_upper1 = df_upper1.drop(\"KQ_Id__c\")\n",
+    "\n",
+    "df_upper1.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save the transformed DataFrame\n",
+    "dlo_name = \"Account_Home_copy__dll\"\n",
+    "client.write_to_dlo(dlo_name, df_upper1, write_mode=WriteMode.APPEND)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/src/datacustomcode/templates/script/build_native_dependencies.sh b/src/datacustomcode/templates/script/build_native_dependencies.sh
new file mode 100755
index 0000000..2fbdce0
--- /dev/null
+++ b/src/datacustomcode/templates/script/build_native_dependencies.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+set -e
+
+# Description: build native dependencies
+
+python3.11 -m venv --copies .venv
+source .venv/bin/activate
+pip install -r requirements.txt
+venv-pack -o native_dependencies.tar.gz -f
diff --git a/src/datacustomcode/templates/script/examples/employee_hierarchy/employee_data.csv b/src/datacustomcode/templates/script/examples/employee_hierarchy/employee_data.csv
new file mode 100644
index 0000000..ec38992
--- /dev/null
+++ b/src/datacustomcode/templates/script/examples/employee_hierarchy/employee_data.csv
@@ -0,0 +1,13 @@
+id,name,position,manager_id
+1,Alice,CEO,
+2,Bob,VP Engineering,1
+3,Charlie,VP Sales,1
+4,David,Engineering Lead,2
+5,Eve,Engineering Lead,2
+6,Frank,Sales Manager,3
+7,Grace,Software Eng.,4
+8,Hannah,Software Eng.,4
+9,Ian,Software Eng.,5
+10,Jack,Sales Rep,6
+11,Kelly,Sales Rep,6
+12,Leo,Intern,7
diff --git a/src/datacustomcode/templates/script/examples/employee_hierarchy/entrypoint.py b/src/datacustomcode/templates/script/examples/employee_hierarchy/entrypoint.py
new file mode 100644
index 0000000..e1d56c4
--- /dev/null
+++ b/src/datacustomcode/templates/script/examples/employee_hierarchy/entrypoint.py
@@ -0,0 +1,78 @@
+from pyspark.sql.functions import (
+    col,
+    concat_ws,
+    lit,
+)
+
+from datacustomcode.client import Client
+from datacustomcode.io.writer.base import WriteMode
+
+
+def main():
+    client = Client()
+
+    employees = client.read_dlo("Employee__dll").persist()
+    employees = employees.select("id__c", "manager_id__c", "name__c", "position__c")
+    employees.show()
+    employees_with_manager = (
+        employees.alias("e")
+        .join(
+            employees.alias("m"),
+            col("e.manager_id__c").cast("string") == col("m.id__c").cast("string"),
+            "left",
+        )
+        .select(
+            col("e.id__c"),
+            col("e.name__c"),
+            col("e.position__c"),
+            col("e.manager_id__c"),
+            col("m.name__c").alias("manager_name__c"),
+        )
+        .persist()
+    )
+
+    hierarchy_df = (
+        employees_with_manager.filter(col("manager_id__c").isNull())
+        .withColumn("hierarchy_level__c", lit(1))
+        .withColumn("management_chain__c", col("name__c"))
+        .persist()
+    )
+
+    current_level = 1
+
+    while True:
+        ewm = employees_with_manager.alias("ewm")
+        hdf = hierarchy_df.filter(col("hierarchy_level__c") == current_level).alias(
+            "hdf"
+        )
+
+        next_level_df = ewm.join(
+            hdf,
+            col("ewm.manager_id__c").cast("string") == col("hdf.id__c").cast("string"),
+            "inner",
+        ).select(
+            col("ewm.id__c"),
+            col("ewm.name__c"),
+            col("ewm.position__c"),
+            col("ewm.manager_id__c"),
+            col("ewm.manager_name__c"),
+            (col("hdf.hierarchy_level__c") + 1).alias("hierarchy_level__c"),
+            concat_ws(" | ", col("hdf.management_chain__c"), col("ewm.name__c")).alias(
+                "management_chain__c"
+            ),
+        )
+
+        if next_level_df.isEmpty():
+            break
+
+        hierarchy_df = hierarchy_df.union(next_level_df).persist()
+        current_level += 1
+
+    hierarchy_df = hierarchy_df.orderBy("hierarchy_level__c", "manager_id__c", "id__c")
+
+    dlo_name = "Employee_Hierarchy__dll"
+    client.write_to_dlo(dlo_name, hierarchy_df, WriteMode.APPEND)
+
+
+if __name__ == "__main__":
+    main()
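
Aside (expected result, derived by hand from the sample CSV above; not part of the patch): the level-by-level self-join should produce these `management_chain__c` values.

    # level 1: Alice
    # level 2: Alice | Bob;  Alice | Charlie
    # level 3: Alice | Bob | David;  Alice | Bob | Eve;  Alice | Charlie | Frank
    # level 4: ... | David | Grace;  ... | David | Hannah;  ... | Eve | Ian;
    #          ... | Frank | Jack;  ... | Frank | Kelly
    # level 5: Alice | Bob | David | Grace | Leo  (Leo reports to Grace)
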
diff --git a/src/datacustomcode/templates/script/jupyterlab.sh b/src/datacustomcode/templates/script/jupyterlab.sh
new file mode 100755
index 0000000..e8445fc
--- /dev/null
+++ b/src/datacustomcode/templates/script/jupyterlab.sh
@@ -0,0 +1,97 @@
+#!/bin/bash
+
+# Description: This script is used to start/stop the jupyter notebook in a docker container
+
+# Function to open browser based on OS
+open_browser() {
+    local url=$1
+    case "$(uname -s)" in
+        Darwin*)  # macOS
+            open "$url"
+            ;;
+        Linux*)  # Linux
+            if command -v xdg-open &> /dev/null; then
+                xdg-open "$url"
+            elif command -v gnome-open &> /dev/null; then
+                gnome-open "$url"
+            else
+                echo "Could not detect the web browser to use"
+                return 1
+            fi
+            ;;
+        CYGWIN*|MINGW32*|MSYS*|MINGW*)  # Windows
+            start "$url"
+            ;;
+        *)
+            echo "Unknown operating system"
+            return 1
+            ;;
+    esac
+}
+
+# Function to check if Docker is installed and running
+check_docker() {
+    if ! command -v docker &> /dev/null; then
+        echo "Docker is not installed"
+        exit 1
+    fi
+    echo "Docker is installed"
+    docker --version
+
+    if ! docker info &> /dev/null; then
+        echo "Docker daemon is not running"
+        exit 1
+    fi
+    echo "Docker daemon is running"
+}
+
+# Function to start Jupyter server
+start_jupyter() {
+    echo "Building the docker image"
+    docker build -t datacloud-customcode .
+
+    echo "Running the docker container"
+    docker run -d --rm -p 8888:8888 \
+        -v "$(pwd)":/workspace \
+        --name jupyter-server \
+        datacloud-customcode jupyter lab \
+        --ip=0.0.0.0 \
+        --port=8888 \
+        --no-browser \
+        --allow-root \
+        --NotebookApp.token='' \
+        --NotebookApp.password='' \
+        --notebook-dir=/workspace
+
+    sleep 3  # Wait for server to start
+    open_browser "http://localhost:8888"
+}
+
+# Function to stop Jupyter server
+stop_jupyter() {
+    echo "Stopping Jupyter server container..."
+    if docker ps -q --filter "name=jupyter-server" | grep -q .; then
+        docker stop jupyter-server
+        echo "Jupyter server stopped successfully"
+    else
+        echo "No Jupyter server container running"
+    fi
+}
+
+# Main script logic
+case "$1" in
+    "start")
+        check_docker
+        start_jupyter
+        ;;
+    "stop")
+        check_docker
+        stop_jupyter
+        ;;
+    *)
+        echo "Usage: $0 {start|stop}"
+        echo "  start - Start Jupyter server"
+        echo "  stop  - Stop Jupyter server"
+        exit 1
+        ;;
+esac
diff --git a/src/datacustomcode/templates/payload/config.json b/src/datacustomcode/templates/script/payload/config.json
similarity index 100%
rename from src/datacustomcode/templates/payload/config.json
rename to src/datacustomcode/templates/script/payload/config.json
diff --git a/src/datacustomcode/templates/payload/entrypoint.py b/src/datacustomcode/templates/script/payload/entrypoint.py
similarity index 100%
rename from src/datacustomcode/templates/payload/entrypoint.py
rename to src/datacustomcode/templates/script/payload/entrypoint.py
diff --git a/src/datacustomcode/templates/script/requirements-dev.txt b/src/datacustomcode/templates/script/requirements-dev.txt
new file mode 100644
index 0000000..7b950aa
--- /dev/null
+++ b/src/datacustomcode/templates/script/requirements-dev.txt
@@ -0,0 +1,10 @@
+# Required packages for the project - Do not modify
+salesforce-cdp-connector>=1.0.16
+pyspark==3.5.1
+pandas
+numpy
+pydantic
+jupyterlab
+ipywidgets
+tqdm
+salesforce-data-customcode
diff --git a/src/datacustomcode/templates/script/requirements.txt b/src/datacustomcode/templates/script/requirements.txt
new file mode 100644
index 0000000..b4fc884
--- /dev/null
+++ b/src/datacustomcode/templates/script/requirements.txt
@@ -0,0 +1 @@
+# Packages required for the custom code
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 75a1e04..b270db7 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -8,10 +8,14 @@
 
 class TestInit:
-    @patch("datacustomcode.template.copy_template")
+    @patch("datacustomcode.template.copy_script_template")
+    @patch("datacustomcode.scan.update_config")
     @patch("datacustomcode.scan.dc_config_json_from_file")
+    @patch("datacustomcode.scan.write_sdk_config")
     @patch("builtins.open", new_callable=mock_open)
-    def test_init_command(self, mock_file, mock_scan, mock_copy):
+    def test_init_command(
+        self, mock_file, mock_write_sdk, mock_scan, mock_update, mock_copy
+    ):
         """Test init command."""
         mock_scan.return_value = {
             "sdkVersion": "1.0.0",
@@ -22,17 +26,31 @@ def test_init_command(self, mock_file, mock_scan, mock_copy):
                 "write": {"dlo": ["output_dlo"]},
             },
         }
+        mock_update.return_value = {
+            "sdkVersion": "1.0.0",
+            "entryPoint": "entrypoint.py",
+            "dataspace": "default",
+            "permissions": {
+                "read": {"dlo": ["input_dlo"]},
+                "write": {"dlo": ["output_dlo"]},
+            },
+        }
 
         runner = CliRunner()
         with runner.isolated_filesystem():
             # Create test directory structure
             os.makedirs(os.path.join("test_dir", "payload"), exist_ok=True)
 
-            result = runner.invoke(init, ["test_dir"])
+            result = runner.invoke(init, ["test_dir", "--code-type", "script"])
 
             assert result.exit_code == 0
             mock_copy.assert_called_once_with("test_dir")
+            # Verify SDK config was written
+            mock_write_sdk.assert_called_once_with("test_dir", {"type": "script"})
             mock_scan.assert_called_once_with(
+                os.path.join("test_dir", "payload", "entrypoint.py"), "script"
+            )
+            mock_update.assert_called_once_with(
                 os.path.join("test_dir", "payload", "entrypoint.py")
             )
@@ -45,8 +63,9 @@ def test_init_command(self, mock_file, mock_scan, mock_copy):
             written_content = "".join(
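
Aside (reader's note, not part of the patch): `@patch` decorators inject mocks bottom-up, so the new parameter order in `test_init_command` is the reverse of the decorator stack.

    # @patch("datacustomcode.template.copy_script_template") -> mock_copy      (outermost, last arg)
    # @patch("datacustomcode.scan.update_config")            -> mock_update
    # @patch("datacustomcode.scan.dc_config_json_from_file") -> mock_scan
    # @patch("datacustomcode.scan.write_sdk_config")         -> mock_write_sdk
    # @patch("builtins.open", new_callable=mock_open)        -> mock_file      (innermost, first arg)
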
                 call.args[0] for call in mock_file().write.call_args_list
             )
-            expected_content = json.dumps(mock_scan.return_value, indent=2)
-            assert written_content == expected_content
+            # The last write should be the updated config
+            expected_content = json.dumps(mock_update.return_value, indent=2)
+            assert expected_content in written_content
 
 
 class TestDeploy:
diff --git a/tests/test_scan.py b/tests/test_scan.py
index 2703173..d3ae2ea 100644
--- a/tests/test_scan.py
+++ b/tests/test_scan.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import json
 import os
 import tempfile
 import textwrap
@@ -8,12 +9,16 @@
 import pytest
 
 from datacustomcode.scan import (
+    SDK_CONFIG_DIR,
+    SDK_CONFIG_FILE,
     DataAccessLayerCalls,
     dc_config_json_from_file,
     scan_file,
     scan_file_for_imports,
+    update_config,
     write_requirements_file,
+    write_sdk_config,
 )
+from datacustomcode.version import get_version
 
 
 def create_test_script(content: str) -> str:
@@ -24,6 +29,21 @@ def create_test_script(content: str) -> str:
     return temp_path
 
 
+def create_sdk_config(base_directory: str, package_type: str = "script") -> str:
+    """Create SDK config file for testing.
+
+    Args:
+        base_directory: The base directory where .datacustomcode_proj should be
+            created
+        package_type: The package type ("script" or "function")
+
+    Returns:
+        Path to the created SDK config file
+    """
+    sdk_config = {"type": package_type}
+    write_sdk_config(base_directory, sdk_config)
+    return os.path.join(base_directory, SDK_CONFIG_DIR, SDK_CONFIG_FILE)
+
+
 class TestClientMethodVisitor:
     def test_variable_tracking(self):
         """Test that the visitor can track variable assignments."""
@@ -282,18 +302,6 @@ def test_multiple_writes(self):
 
 
 class TestDcConfigJson:
-    @patch(
-        "datacustomcode.scan.DATA_TRANSFORM_CONFIG_TEMPLATE",
-        {
-            "sdkVersion": "1.2.3",
-            "entryPoint": "",
-            "dataspace": "default",
-            "permissions": {
-                "read": {},
-                "write": {},
-            },
-        },
-    )
     def test_dlo_to_dlo_config(self):
         """Test generating config JSON for DLO to DLO operations."""
         content = textwrap.dedent(
@@ -310,28 +318,27 @@ def test_dlo_to_dlo_config(self):
             """
         )
         temp_path = create_test_script(content)
+        file_dir = os.path.dirname(temp_path)
+        # Create SDK config in the same directory as the script
+        sdk_config_path = create_sdk_config(file_dir, "script")
        try:
-            result = dc_config_json_from_file(temp_path)
+            result = dc_config_json_from_file(temp_path, "script")
             assert result["entryPoint"] == os.path.basename(temp_path)
             assert result["dataspace"] == "default"
-            assert result["sdkVersion"] == "1.2.3"  # From mocked version
+            assert result["sdkVersion"] == get_version()
+            config_path = os.path.join(file_dir, "config.json")
+            with open(config_path, "w") as f:
+                json.dump(result, f)
+            result = update_config(temp_path)
             assert result["permissions"]["read"]["dlo"] == ["input_dlo"]
             assert result["permissions"]["write"]["dlo"] == ["output_dlo"]
         finally:
             os.remove(temp_path)
+            if os.path.exists(sdk_config_path):
+                os.remove(sdk_config_path)
+                os.rmdir(os.path.dirname(sdk_config_path))
 
-    @patch(
-        "datacustomcode.scan.DATA_TRANSFORM_CONFIG_TEMPLATE",
-        {
-            "sdkVersion": "1.2.3",
-            "entryPoint": "",
-            "dataspace": "default",
-            "permissions": {
-                "read": {},
-                "write": {},
-            },
-        },
-    )
     def test_dmo_to_dmo_config(self):
         """Test generating config JSON for DMO to DMO operations."""
         content = textwrap.dedent(
@@ -348,15 +355,22 @@ def test_dmo_to_dmo_config(self):
             """
         )
         temp_path = create_test_script(content)
+        file_dir = os.path.dirname(temp_path)
+        # Create SDK config in the same directory as the script
+        sdk_config_path = create_sdk_config(file_dir, "script")
         try:
-            config = dc_config_json_from_file(temp_path)
+            config = dc_config_json_from_file(temp_path, "script")
             assert config["entryPoint"] == os.path.basename(temp_path)
             assert config["dataspace"] == "default"
-            assert config["sdkVersion"] == "1.2.3"  # From mocked version
+            assert config["sdkVersion"] == get_version()
+            config_path = os.path.join(file_dir, "config.json")
+            with open(config_path, "w") as f:
+                json.dump(config, f)
+            config = update_config(temp_path)
             assert config["permissions"]["read"]["dmo"] == ["input_dmo"]
             assert config["permissions"]["write"]["dmo"] == ["output_dmo"]
         finally:
             os.remove(temp_path)
+            if os.path.exists(sdk_config_path):
+                os.remove(sdk_config_path)
+                os.rmdir(os.path.dirname(sdk_config_path))
 
     @patch(
         "datacustomcode.scan.DATA_TRANSFORM_CONFIG_TEMPLATE",
         {
             "sdkVersion": "1.2.3",
             "entryPoint": "",
             "dataspace": "default",
             "permissions": {
                 "read": {},
                 "write": {},
             },
         },
     )
@@ -388,10 +402,14 @@ def test_preserves_existing_dataspace(self):
         config_path = os.path.join(file_dir, "config.json")
 
         try:
+            # Create SDK config
+            sdk_config_path = create_sdk_config(file_dir, "script")
             # Create an existing config.json with a custom dataspace
             existing_config = {
                 "sdkVersion": "1.0.0",
                 "entryPoint": "test.py",
+                "type": "script",
                 "dataspace": "my_custom_dataspace",
                 "permissions": {
                     "read": {"dlo": ["old_dlo"]},
@@ -402,7 +420,7 @@ def test_preserves_existing_dataspace(self):
                 json.dump(existing_config, f)
 
             # Generate new config - should preserve dataspace
-            result = dc_config_json_from_file(temp_path)
+            result = update_config(temp_path)
             assert result["dataspace"] == "my_custom_dataspace"
             assert result["permissions"]["read"]["dlo"] == ["input_dlo"]
             assert result["permissions"]["write"]["dlo"] == ["output_dlo"]
@@ -410,23 +428,13 @@ def test_preserves_existing_dataspace(self):
             os.remove(temp_path)
             if os.path.exists(config_path):
                 os.remove(config_path)
+            if os.path.exists(sdk_config_path):
+                os.remove(sdk_config_path)
+                os.rmdir(os.path.dirname(sdk_config_path))
 
-    @patch(
-        "datacustomcode.scan.DATA_TRANSFORM_CONFIG_TEMPLATE",
-        {
-            "sdkVersion": "1.2.3",
-            "entryPoint": "",
-            "dataspace": "",
-            "permissions": {
-                "read": {},
-                "write": {},
-            },
-        },
-    )
-    def test_uses_default_for_empty_dataspace(self, caplog):
+    def test_uses_default_for_empty_dataspace(self):
         """Test that empty dataspace value uses default and logs warning."""
         import json
-        import logging
 
         content = textwrap.dedent(
             """
@@ -442,7 +450,9 @@ def test_uses_default_for_empty_dataspace(self):
         config_path = os.path.join(file_dir, "config.json")
 
         try:
-            # Create an existing config.json with empty dataspace
+            # Create SDK config
+            sdk_config_path = create_sdk_config(file_dir, "script")
+            # Create an existing config.json with empty dataspace (without type field)
             existing_config = {
                 "sdkVersion": "1.0.0",
                 "entryPoint": "test.py",
@@ -456,37 +466,20 @@ def test_uses_default_for_empty_dataspace(self):
                 json.dump(existing_config, f)
 
             # Should use "default" for empty dataspace (not raise error)
-            with caplog.at_level(logging.WARNING):
-                result = dc_config_json_from_file(temp_path)
+            result = update_config(temp_path)
             assert result["dataspace"] == "default"
             assert result["permissions"]["read"]["dlo"] == ["input_dlo"]
             assert result["permissions"]["write"]["dlo"] == ["output_dlo"]
-            # Verify that a warning was logged
-            assert len(caplog.records) > 0
-            assert any(
-                "dataspace" in record.message.lower()
-                and "empty" in record.message.lower()
-                for record in caplog.records
-            )
         finally:
             os.remove(temp_path)
             if os.path.exists(config_path):
                 os.remove(config_path)
+            if os.path.exists(sdk_config_path):
+                os.remove(sdk_config_path)
+                os.rmdir(os.path.dirname(sdk_config_path))
 
-    @patch(
-        "datacustomcode.scan.DATA_TRANSFORM_CONFIG_TEMPLATE",
-        {
-            "sdkVersion": "1.2.3",
-            "entryPoint": "",
-            "dataspace": "",
-            "permissions": {
-                "read": {},
-                "write": {},
-            },
-        },
-    )
     def test_uses_default_dataspace_when_no_config(self):
         """Test missing config.json uses default dataspace."""
         content = textwrap.dedent(
@@ -502,25 +495,11 @@ def test_uses_default_dataspace_when_no_config(self):
 
         try:
             # No existing config.json - should use "default" dataspace
-            result = dc_config_json_from_file(temp_path)
+            result = dc_config_json_from_file(temp_path, "script")
             assert result["dataspace"] == "default"
-            assert result["permissions"]["read"]["dlo"] == ["input_dlo"]
-            assert result["permissions"]["write"]["dlo"] == ["output_dlo"]
         finally:
             os.remove(temp_path)
 
-    @patch(
-        "datacustomcode.scan.DATA_TRANSFORM_CONFIG_TEMPLATE",
-        {
-            "sdkVersion": "1.2.3",
-            "entryPoint": "",
-            "dataspace": "",
-            "permissions": {
-                "read": {},
-                "write": {},
-            },
-        },
-    )
     def test_rejects_missing_dataspace(self):
         """Test that config.json missing dataspace field raises ValueError."""
         import json
@@ -539,7 +518,10 @@ def test_rejects_missing_dataspace(self):
         config_path = os.path.join(file_dir, "config.json")
 
         try:
+            # Create SDK config
+            sdk_config_path = create_sdk_config(file_dir, "script")
             # Create an existing config.json without dataspace field
+            # (without type field)
             existing_config = {
                 "sdkVersion": "1.0.0",
                 "entryPoint": "test.py",
@@ -553,13 +535,18 @@ def test_rejects_missing_dataspace(self):
 
             # Should raise ValueError when dataspace field is missing
             with pytest.raises(
-                ValueError, match="dataspace must be defined in.*config.json"
+                ValueError,
+                match="dataspace must be defined. Please add a 'dataspace' field to "
+                "the config.json file.",
             ):
-                dc_config_json_from_file(temp_path)
+                update_config(temp_path)
         finally:
             os.remove(temp_path)
             if os.path.exists(config_path):
                 os.remove(config_path)
+            if os.path.exists(sdk_config_path):
+                os.remove(sdk_config_path)
+                os.rmdir(os.path.dirname(sdk_config_path))
 
     def test_raises_error_on_invalid_json(self):
         """Test that invalid JSON in config.json raises an error."""
@@ -584,7 +571,7 @@ def test_raises_error_on_invalid_json(self):
 
             # Should raise ValueError for invalid JSON
             with pytest.raises(ValueError, match="Failed to parse JSON"):
-                dc_config_json_from_file(temp_path)
+                update_config(temp_path)
         finally:
             os.remove(temp_path)
             if os.path.exists(config_path):
@@ -660,8 +647,11 @@ def process_customer_data():
             result = scan_file(temp_path)
             assert "customer_data_raw" in result.read_dlo
             assert "customer_data_enriched" in result.write_to_dlo
-
-            config = dc_config_json_from_file(temp_path)
+            result = dc_config_json_from_file(temp_path, "script")
+            config_path = os.path.join(os.path.dirname(temp_path), "config.json")
+            with open(config_path, "w") as f:
+                json.dump(result, f)
+            config = update_config(temp_path)
             assert config["permissions"]["read"]["dlo"] == ["customer_data_raw"]
             assert config["permissions"]["write"]["dlo"] == ["customer_data_enriched"]
         finally:
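
Aside (sketch only, not part of the patch): the repeated create/cleanup of the SDK config in these tests could be hoisted into a fixture built on pytest's `tmp_path`, e.g.:

    import pytest
    from datacustomcode.scan import write_sdk_config

    @pytest.fixture
    def script_sdk_config(tmp_path):
        """Write a script-type SDK config into an isolated temp project dir."""
        write_sdk_config(str(tmp_path), {"type": "script"})
        return tmp_path
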
diff --git a/tests/test_template.py b/tests/test_template.py
index 1d2339e..7475a97 100644
--- a/tests/test_template.py
+++ b/tests/test_template.py
@@ -6,7 +6,7 @@
 
 import pytest
 
-from datacustomcode.template import copy_template, template_dir
+from datacustomcode.template import copy_script_template, script_template_dir
 
 
 class TestTemplate:
@@ -18,17 +18,17 @@ def temp_dir(self):
 
     def test_template_dir_exists(self):
         """Test that the template directory exists."""
-        assert os.path.isdir(template_dir)
+        assert os.path.isdir(script_template_dir)
         # Verify there are actual files in the template dir
-        assert len(os.listdir(template_dir)) > 0
+        assert len(os.listdir(script_template_dir)) > 0
 
     def test_copy_template_basic(self, temp_dir):
         """Test basic functionality of copy_template."""
         # Call the function to copy templates to temporary directory
-        copy_template(temp_dir)
+        copy_script_template(temp_dir)
 
         # Get the list of files in both directories
-        template_items = os.listdir(template_dir)
+        template_items = os.listdir(script_template_dir)
         copied_items = os.listdir(temp_dir)
 
         # Verify all template items were copied
@@ -36,7 +36,7 @@ def test_copy_template_basic(self, temp_dir):
             assert item in copied_items
 
             # Check if the copied item matches the original
-            source_path = os.path.join(template_dir, item)
+            source_path = os.path.join(script_template_dir, item)
             dest_path = os.path.join(temp_dir, item)
 
             if os.path.isdir(source_path):
@@ -82,13 +82,13 @@ def test_copy_template_creates_nonexistent_dir(self, temp_dir):
         assert not os.path.exists(nonexistent_dir)
 
         # Copy templates
-        copy_template(nonexistent_dir)
+        copy_script_template(nonexistent_dir)
 
         # Verify directory was created
         assert os.path.isdir(nonexistent_dir)
 
         # Verify contents match the template directory
-        template_items = set(os.listdir(template_dir))
+        template_items = set(os.listdir(script_template_dir))
         copied_items = set(os.listdir(nonexistent_dir))
         assert template_items == copied_items
 
@@ -100,7 +100,7 @@ def test_copy_template_with_existing_content(self, temp_dir):
             f.write("This is an existing file")
 
         # Copy templates
-        copy_template(temp_dir)
+        copy_script_template(temp_dir)
 
         # Verify the existing file is still there
         assert os.path.exists(test_file)
@@ -111,6 +111,6 @@ def test_copy_template_with_existing_content(self, temp_dir):
         assert content == "This is an existing file"
 
         # Verify all template items were also copied
-        template_items = os.listdir(template_dir)
+        template_items = os.listdir(script_template_dir)
         for item in template_items:
             assert os.path.exists(os.path.join(temp_dir, item))