diff --git a/build.py b/build.py index ab118f76..0e7c919f 100644 --- a/build.py +++ b/build.py @@ -1,4 +1,5 @@ from collections import defaultdict +import json import os.path import shutil import subprocess @@ -7,25 +8,41 @@ from jinja2 import Environment, select_autoescape, FileSystemLoader from pipeline.translator import PythonBuilder -from pipeline.utils import clone_sources, SchemaLoader +from pipeline.utils import clone_sources, SchemaLoader, InstanceLoader -print("***************************************") +print("*********************************************************") print(f"Triggering the generation of Python package for openMINDS") -print("***************************************") +print("*********************************************************") + +# Step 0 - read code for additional methods +additional_methods = {} +with open("pipeline/src/additional_methods/by_name.py.txt") as fp: + code = fp.read() +additional_methods["by_name"] = code # Step 1 - clone central repository in main branch to get the latest sources clone_sources() schema_loader = SchemaLoader() +instance_loader = InstanceLoader() if os.path.exists("target"): shutil.rmtree("target") +# Step 2 - load instances +instances = {} +for version in instance_loader.get_instance_versions(): + instances[version] = defaultdict(list) + for instance_path in instance_loader.find_instances(version): + with open(instance_path) as fp: + instance_data = json.load(fp) + instances[version][instance_data["@type"]].append(instance_data) + python_modules = defaultdict(list) for schema_version in schema_loader.get_schema_versions(): - # Step 2 - find all involved schemas for the current version + # Step 3 - find all involved schemas for the current version schemas_file_paths = schema_loader.find_schemas(schema_version) - # Step 3a - figure out which schemas are embedded and which are linked + # Step 4a - figure out which schemas are embedded and which are linked embedded = set() linked = set() for 
@classmethod
def instances(cls):
    """Return the pre-defined instances attached to this class.

    These are the attributes stored directly on ``cls`` whose value is
    itself an instance of ``cls``.
    """
    return [value for value in vars(cls).values() if isinstance(value, cls)]


@classmethod
def by_name(cls, name):
    """Return the pre-defined instance registered under ``name``.

    ``name`` is matched against each instance's ``name`` and against every
    entry of its ``synonyms``. The lookup table is built on first use and
    cached on the class in ``_instance_lookup``.

    Raises:
        KeyError: if no instance is known under ``name``.
    """
    if cls._instance_lookup is None:
        known = cls.instances()
        # Build into a local dict and publish it only once complete, so a
        # failure part-way through cannot leave a half-filled cache behind.
        lookup = {}
        for instance in known:
            # getattr: not every class necessarily defines a ``name``.
            primary = getattr(instance, "name", None)
            if primary is not None:
                lookup[primary] = instance
        # Synonyms go in a second pass with setdefault so that a synonym
        # never shadows another instance's primary name.
        for instance in known:
            for synonym in getattr(instance, "synonyms", None) or ():
                lookup.setdefault(synonym, instance)
        cls._instance_lookup = lookup
    return cls._instance_lookup[name]
b/pipeline/src/base.py @@ -170,6 +170,7 @@ class LinkedMetadata(Node): """ A Python representation of a metadata node that should have a unique identifier. """ + _instance_lookup = None def __init__(self, id=None, **properties): self.id = id # todo: check this is a URI diff --git a/pipeline/src/module_template.py.txt b/pipeline/src/module_template.py.txt index 8c9cd53b..a6845479 100644 --- a/pipeline/src/module_template.py.txt +++ b/pipeline/src/module_template.py.txt @@ -47,3 +47,10 @@ class {{ class_name }}({{ base_class }}): return super().__init__({%- if base_class == "LinkedMetadata" %}id=id, {%- endif -%}{%- for property in properties -%}{{property.name}}={{property.name}}, {%- endfor -%}) {{ additional_methods }} + +{% for instance_name, instance in instances.items() %} +{{ class_name }}.{{ instance_name }} = {{ class_name }}( + {% for key, value in instance.items() -%} + {% if value is string %}{{key}}="{{value}}",{% else %}{{key}}={{value}},{% endif %} + {% endfor -%} +){% endfor %} \ No newline at end of file diff --git a/pipeline/src/registry.py b/pipeline/src/registry.py index 0cbf6952..e736fc8b 100644 --- a/pipeline/src/registry.py +++ b/pipeline/src/registry.py @@ -5,24 +5,27 @@ """ from __future__ import annotations +from collections import defaultdict from typing import TYPE_CHECKING, Union, List, Optional if TYPE_CHECKING: from .base import ContainsMetadata -registry: dict = {"names": {}, "types": {}} +registry: dict = {"names": {}, "types": defaultdict(dict)} def register_class(target_class: ContainsMetadata): """Add a class to the registry""" if "openminds" in target_class.__module__: parts = target_class.__module__.split(".") + assert parts[0] == "openminds" + version = parts[1] name = ".".join(parts[0:3] + [target_class.__name__]) # e.g. 
# Spelled-out form of each decimal digit. Used to rewrite a leading digit,
# which is not allowed at the start of a Python identifier.
number_names = {
    "0": "zero",
    "1": "one",
    "2": "two",
    "3": "three",
    "4": "four",
    "5": "five",
    "6": "six",
    "7": "seven",
    "8": "eight",
    "9": "nine",
}

# Character substitutions applied after snake-casing, done in a single pass.
# A value of ``None`` deletes the character.
_NAME_TRANSLATION = str.maketrans(
    {
        "-": "_",
        ".": "_",
        "+": "plus",
        "#": "sharp",
        ",": "comma",
        "(": None,
        ")": None,
    }
)


def generate_python_name(json_name, allow_multiple=False):
    """Convert a JSON-style name into a valid snake_case Python identifier.

    Surrounding whitespace is stripped, camelCase boundaries become
    underscores, the result is lower-cased, a handful of punctuation
    characters are replaced or removed, a leading digit is spelled out
    (``"3d"`` -> ``"three_d"``), and a reserved word gets a trailing
    underscore (``"class"`` -> ``"class_"``).

    Args:
        json_name: the name to convert.
        allow_multiple: accepted for backward compatibility; not used here.

    Returns:
        The generated identifier.

    Raises:
        NameError: if no valid identifier can be produced, including when
            ``json_name`` is empty or only whitespace.
    """
    import keyword  # function-scope import keeps the module import block untouched

    python_name = re.sub(r"(.)([A-Z][a-z]+)", r"\1_\2", json_name.strip())
    python_name = re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", python_name).lower()
    python_name = python_name.translate(_NAME_TRANSLATION)

    # Slice rather than index: an empty name must fall through to the
    # NameError below instead of raising IndexError here.
    leading = python_name[:1]
    if leading in number_names:  # Python variables can't start with a number
        python_name = number_names[leading] + python_name[1:]

    # Reserved words pass isidentifier() but cannot be used as attribute or
    # parameter names in the generated code.
    if keyword.iskeyword(python_name):
        python_name += "_"

    if not python_name.isidentifier():
        raise NameError(f"Cannot generate a valid Python name from '{json_name}'")
    return python_name
def clone_sources():
    """Fetch fresh shallow clones of the openMINDS schemas and instances.

    Any existing ``_sources`` directory is deleted first; the two
    repositories are then cloned with ``depth=1`` into ``_sources/schemas``
    and ``_sources/instances``.
    """
    if os.path.exists("_sources"):
        shutil.rmtree("_sources")
    repositories = (
        ("https://github.com/openMetadataInitiative/openMINDS.git", "_sources/schemas"),
        ("https://github.com/openMetadataInitiative/openMINDS_instances.git", "_sources/instances"),
    )
    for url, destination in repositories:
        Repo.clone_from(url, to_path=destination, depth=1)


def _list_version_directories(root: str) -> List[str]:
    """Return the names of the sub-directories of ``root``, sorted."""
    # os.listdir gives no ordering guarantee and also reports plain files;
    # versions are directories, and sorting keeps builds reproducible.
    return sorted(
        entry for entry in os.listdir(root) if os.path.isdir(os.path.join(root, entry))
    )


def _find_files(root: str, version: str, pattern: str) -> List[str]:
    """Return the sorted paths under ``root/version`` matching ``pattern``."""
    # glob results come back in arbitrary order, hence the explicit sort.
    return sorted(glob.glob(os.path.join(root, version, pattern), recursive=True))


class SchemaLoader:
    """Locate the schema files inside the cloned schema repository."""

    def __init__(self):
        # Paths are resolved against the working directory at construction time.
        self._root_directory = os.path.realpath(".")
        self.schemas_sources = os.path.join(
            self._root_directory, "_sources", "schemas", "schemas"
        )

    def get_schema_versions(self) -> List[str]:
        """Return the version directory names found under the schema sources, sorted."""
        return _list_version_directories(self.schemas_sources)

    def find_schemas(self, version: str) -> List[str]:
        """Return the sorted paths of all ``*.schema.omi.json`` files for ``version``."""
        return _find_files(self.schemas_sources, version, "**/*.schema.omi.json")


class InstanceLoader:
    """Locate the instance files inside the cloned instances repository."""

    def __init__(self):
        # Paths are resolved against the working directory at construction time.
        self._root_directory = os.path.realpath(".")
        self.instances_sources = os.path.join(
            self._root_directory, "_sources", "instances", "instances"
        )

    def get_instance_versions(self) -> List[str]:
        """Return the version directory names found under the instance sources, sorted."""
        return _list_version_directories(self.instances_sources)

    def find_instances(self, version: str) -> List[str]:
        """Return the sorted paths of all ``*.jsonld`` files for ``version``."""
        return _find_files(self.instances_sources, version, "**/*.jsonld")