Skip to content

Rust: adapt model generation to new format #19819

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 18 commits into from
Jun 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ repos:
rev: 25.1.0
hooks:
- id: black
files: ^(misc/codegen/.*|misc/scripts/models-as-data/bulk_generate_mad)\.py$
files: ^(misc/codegen/.*|misc/scripts/models-as-data/.*)\.py$

- repo: local
hooks:
Expand Down
91 changes: 50 additions & 41 deletions misc/scripts/models-as-data/bulk_generate_mad.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
Note: This file must be formatted using the Black Python formatter.
"""

import os.path
import pathlib
import subprocess
import sys
from typing import Required, TypedDict, List, Callable, Optional
Expand Down Expand Up @@ -41,7 +41,7 @@ def missing_module(module_name: str) -> None:
.decode("utf-8")
.strip()
)
build_dir = os.path.join(gitroot, "mad-generation-build")
build_dir = pathlib.Path(gitroot, "mad-generation-build")


# A project to generate models for
Expand Down Expand Up @@ -86,10 +86,10 @@ def clone_project(project: Project) -> str:
git_tag = project.get("git-tag")

# Determine target directory
target_dir = os.path.join(build_dir, name)
target_dir = build_dir / name

# Clone only if directory doesn't already exist
if not os.path.exists(target_dir):
if not target_dir.exists():
if git_tag:
print(f"Cloning {name} from {repo_url} at tag {git_tag}")
else:
Expand Down Expand Up @@ -191,10 +191,10 @@ def build_database(
name = project["name"]

# Create database directory path
database_dir = os.path.join(build_dir, f"{name}-db")
database_dir = build_dir / f"{name}-db"

# Only build the database if it doesn't already exist
if not os.path.exists(database_dir):
if not database_dir.exists():
print(f"Building CodeQL database for {name}...")
extractor_options = [option for x in extractor_options for option in ("-O", x)]
try:
Expand Down Expand Up @@ -236,13 +236,16 @@ def generate_models(config, args, project: Project, database_dir: str) -> None:
language = config["language"]

generator = mad.Generator(language)
# Note: The argument parser converts with-sinks to with_sinks, etc.
generator.generateSinks = should_generate_sinks(project)
generator.generateSources = should_generate_sources(project)
generator.generateSummaries = should_generate_summaries(project)
generator.setenvironment(database=database_dir, folder=name)
generator.with_sinks = should_generate_sinks(project)
generator.with_sources = should_generate_sources(project)
generator.with_summaries = should_generate_summaries(project)
generator.threads = args.codeql_threads
generator.ram = args.codeql_ram
if config.get("single-file", False):
generator.single_file = name
else:
generator.folder = name
generator.setenvironment(database=database_dir)
generator.run()


Expand Down Expand Up @@ -313,20 +316,14 @@ def download_artifact(url: str, artifact_name: str, pat: str) -> str:
if response.status_code != 200:
print(f"Failed to download file. Status code: {response.status_code}")
sys.exit(1)
target_zip = os.path.join(build_dir, zipName)
target_zip = build_dir / zipName
with open(target_zip, "wb") as file:
for chunk in response.iter_content(chunk_size=8192):
file.write(chunk)
print(f"Download complete: {target_zip}")
return target_zip


def remove_extension(filename: str) -> str:
    """Strip every trailing extension from a file name (e.g. "db.tar.gz" -> "db").

    Loops until `os.path.splitext` no longer removes anything, rather than
    until no "." remains in the string: `splitext` ignores leading dots and
    dots in directory components, so a `while "." in filename` loop would hang
    forever on inputs like ".bashrc" or "some.dir/name".
    """
    root, ext = os.path.splitext(filename)
    while ext:
        # splitext removed a real extension; keep peeling.
        filename = root
        root, ext = os.path.splitext(filename)
    return filename


def pretty_name_from_artifact_name(artifact_name: str) -> str:
    """Return the project name embedded in a DCA artifact name.

    Artifact names look like "<prefix>___<pretty-name>[___...]"; the second
    "___"-separated field is the human-readable project name.
    """
    fields = artifact_name.split("___")
    return fields[1]

Expand All @@ -348,7 +345,7 @@ def download_dca_databases(
"""
print("\n=== Finding projects ===")
project_map = {project["name"]: project for project in projects}
analyzed_databases = {}
analyzed_databases = {n: None for n in project_map}
for experiment_name in experiment_names:
response = get_json_from_github(
f"https://raw.githubusercontent.com/github/codeql-dca-main/data/{experiment_name}/reports/downloads.json",
Expand All @@ -361,17 +358,24 @@ def download_dca_databases(
artifact_name = analyzed_database["artifact_name"]
pretty_name = pretty_name_from_artifact_name(artifact_name)

if not pretty_name in project_map:
if not pretty_name in analyzed_databases:
print(f"Skipping {pretty_name} as it is not in the list of projects")
continue

if pretty_name in analyzed_databases:
if analyzed_databases[pretty_name] is not None:
print(
f"Skipping previous database {analyzed_databases[pretty_name]['artifact_name']} for {pretty_name}"
)

analyzed_databases[pretty_name] = analyzed_database

not_found = [name for name, db in analyzed_databases.items() if db is None]
if not_found:
print(
f"ERROR: The following projects were not found in the DCA experiments: {', '.join(not_found)}"
)
sys.exit(1)

def download_and_decompress(analyzed_database: dict) -> str:
artifact_name = analyzed_database["artifact_name"]
repository = analyzed_database["repository"]
Expand All @@ -393,19 +397,17 @@ def download_and_decompress(analyzed_database: dict) -> str:
# The database is in a zip file, which contains a tar.gz file with the DB
# First we open the zip file
with zipfile.ZipFile(artifact_zip_location, "r") as zip_ref:
artifact_unzipped_location = os.path.join(build_dir, artifact_name)
artifact_unzipped_location = build_dir / artifact_name
# clean up any remnants of previous runs
shutil.rmtree(artifact_unzipped_location, ignore_errors=True)
# And then we extract it to build_dir/artifact_name
zip_ref.extractall(artifact_unzipped_location)
# And then we extract the language tar.gz file inside it
artifact_tar_location = os.path.join(
artifact_unzipped_location, f"{language}.tar.gz"
)
artifact_tar_location = artifact_unzipped_location / f"{language}.tar.gz"
with tarfile.open(artifact_tar_location, "r:gz") as tar_ref:
# And we just untar it to the same directory as the zip file
tar_ref.extractall(artifact_unzipped_location)
ret = os.path.join(artifact_unzipped_location, language)
ret = artifact_unzipped_location / language
print(f"Decompression complete: {ret}")
return ret

Expand All @@ -425,8 +427,16 @@ def download_and_decompress(analyzed_database: dict) -> str:
return [(project_map[n], r) for n, r in zip(analyzed_databases, results)]


def get_mad_destination_for_project(config, name: str) -> str:
    """Path under the configured destination where *name*'s MaD files go."""
    destination = config["destination"]
    return os.path.join(destination, name)
def clean_up_mad_destination_for_project(config, name):
    """Delete any previously generated MaD output for *name*.

    In single-file mode the output is one "<name>.model.yml" file; otherwise
    it is a directory named after the project.
    """
    target = pathlib.Path(config["destination"], name)
    if not config.get("single-file", False):
        # Directory-per-project layout.
        if target.exists():
            print(f"Deleting existing MaD directory at {target}")
            shutil.rmtree(target, ignore_errors=True)
        return
    # Single-file layout. NOTE(review): with_suffix replaces an existing
    # suffix, so a project name containing "." would be truncated — assumed
    # not to occur in practice; confirm against the project configs.
    target = target.with_suffix(".model.yml")
    if target.exists():
        print(f"Deleting existing MaD file at {target}")
        target.unlink()


def get_strategy(config) -> str:
Expand All @@ -448,8 +458,7 @@ def main(config, args) -> None:
language = config["language"]

# Create build directory if it doesn't exist
if not os.path.exists(build_dir):
os.makedirs(build_dir)
build_dir.mkdir(parents=True, exist_ok=True)

database_results = []
match get_strategy(config):
Expand All @@ -469,7 +478,7 @@ def main(config, args) -> None:
if args.pat is None:
print("ERROR: --pat argument is required for DCA strategy")
sys.exit(1)
if not os.path.exists(args.pat):
if not args.pat.exists():
print(f"ERROR: Personal Access Token file '{pat}' does not exist.")
sys.exit(1)
with open(args.pat, "r") as f:
Expand All @@ -493,12 +502,9 @@ def main(config, args) -> None:
)
sys.exit(1)

# Delete the MaD directory for each project
for project, database_dir in database_results:
mad_dir = get_mad_destination_for_project(config, project["name"])
if os.path.exists(mad_dir):
print(f"Deleting existing MaD directory at {mad_dir}")
subprocess.check_call(["rm", "-rf", mad_dir])
# clean up existing MaD data for the projects
for project, _ in database_results:
clean_up_mad_destination_for_project(config, project["name"])

for project, database_dir in database_results:
if database_dir is not None:
Expand All @@ -508,7 +514,10 @@ def main(config, args) -> None:
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--config", type=str, help="Path to the configuration file.", required=True
"--config",
type=pathlib.Path,
help="Path to the configuration file.",
required=True,
)
parser.add_argument(
"--dca",
Expand All @@ -519,13 +528,13 @@ def main(config, args) -> None:
)
parser.add_argument(
"--pat",
type=str,
type=pathlib.Path,
help="Path to a file containing the PAT token required to grab DCA databases (the same as the one you use for DCA)",
)
parser.add_argument(
"--codeql-ram",
type=int,
help="What `--ram` value to pass to `codeql` while generating models (by default the flag is not passed)",
help="What `--ram` value to pass to `codeql` while generating models (by default 2048 MB per thread)",
default=None,
)
parser.add_argument(
Expand All @@ -538,7 +547,7 @@ def main(config, args) -> None:

# Load config file
config = {}
if not os.path.exists(args.config):
if not args.config.exists():
print(f"ERROR: Config file '{args.config}' does not exist.")
sys.exit(1)
try:
Expand Down
59 changes: 41 additions & 18 deletions misc/scripts/models-as-data/convert_extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,65 +7,86 @@
import sys
import tempfile


def quote_if_needed(v):
    """Render one bqrs cell for inclusion in a model yaml row.

    String columns are wrapped in double quotes; every other column type
    (e.g. the bool columns) is emitted via str().
    """
    if isinstance(v, str):
        return '"' + v + '"'
    # bool column
    return str(v)


def parseData(data):
    """Partition raw result rows into two yaml-fragment maps keyed by row[0].

    Index 0 collects manually written models, index 1 those whose provenance
    column (the last cell) ends with "generated".
    """
    manual, generated = {}, {}
    for row in data:
        cells = ", ".join(map(quote_if_needed, row))
        bucket = generated if row[-1].endswith("generated") else manual
        helpers.insert_update(bucket, row[0], " - [" + cells + "]\n")
    return [manual, generated]


class Converter:
def __init__(self, language, dbDir):
    """Collect the paths needed to convert models for *language* from *dbDir*."""
    self.language = language
    self.dbDir = dbDir
    # Repository root, used to locate the queries and the ext/ directory.
    gitroot = subprocess.check_output(["git", "rev-parse", "--show-toplevel"])
    self.codeQlRoot = gitroot.decode("utf-8").strip()
    self.extDir = os.path.join(self.codeQlRoot, f"{self.language}/ql/lib/ext/")
    self.dirname = "modelconverter"
    self.modelFileExtension = ".model.yml"
    # Scratch directory for intermediate bqrs output.
    self.workDir = tempfile.mkdtemp()


def runQuery(self, query):
    """Run *query* against self.dbDir and return the decoded bqrs rows."""
    print("########## Querying: ", query)
    queryFile = os.path.join(
        self.codeQlRoot, f"{self.language}/ql/src/utils/{self.dirname}", query
    )
    resultBqrs = os.path.join(self.workDir, "out.bqrs")
    cmd = [
        "codeql",
        "query",
        "run",
        queryFile,
        "--database",
        self.dbDir,
        "--output",
        resultBqrs,
    ]
    helpers.run_cmd(cmd, "Failed to generate " + query)
    return helpers.readData(self.workDir, resultBqrs)


def asAddsTo(self, rows, predicate):
    """Wrap each grouped row-set in an addsTo template for *predicate*.

    Returns a two-element list mirroring *rows* (manual / generated).
    """
    pack = f"codeql/{self.language}-all"
    extensions = [{}, {}]
    for i in (0, 1):
        for key, body in rows[i].items():
            extensions[i][key] = helpers.addsToTemplate.format(pack, predicate, body)
    return extensions

def getAddsTo(self, query, predicate):
    """Run *query* and convert its results into addsTo extension snippets.

    Returns a two-element list: index 0 holds manual models, index 1 the
    "generated"-provenance ones (see parseData).
    """
    data = self.runQuery(query)
    rows = parseData(data)
    return self.asAddsTo(rows, predicate)


def makeContent(self):
    """Collect summaries, sources, sinks and neutrals as merged extensions.

    Queries run in the original order (summaries first); the merge argument
    order (sources, sinks, summaries, neutrals) is preserved for each of the
    two buckets (manual / generated).
    """
    summaries = self.getAddsTo("ExtractSummaries.ql", helpers.summaryModelPredicate)
    sources = self.getAddsTo("ExtractSources.ql", helpers.sourceModelPredicate)
    sinks = self.getAddsTo("ExtractSinks.ql", helpers.sinkModelPredicate)
    neutrals = self.getAddsTo("ExtractNeutrals.ql", helpers.neutralModelPredicate)
    return [
        helpers.merge(sources[i], sinks[i], summaries[i], neutrals[i])
        for i in (0, 1)
    ]

def save(self, extensions):
# Create directory if it doesn't exist
Expand All @@ -77,9 +98,11 @@ def save(self, extensions):
for entry in extensions[0]:
with open(self.extDir + "/" + entry + self.modelFileExtension, "w") as f:
f.write(extensionTemplate.format(extensions[0][entry]))

for entry in extensions[1]:
with open(self.extDir + "/generated/" + entry + self.modelFileExtension, "w") as f:
with open(
self.extDir + "/generated/" + entry + self.modelFileExtension, "w"
) as f:
f.write(extensionTemplate.format(extensions[1][entry]))

def run(self):
Expand Down
Loading
Loading