Skip to content

Rust: adapt model generation to new format #19819

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 18 commits into from
Jun 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ repos:
rev: 25.1.0
hooks:
- id: black
files: ^(misc/codegen/.*|misc/scripts/models-as-data/bulk_generate_mad)\.py$
files: ^(misc/codegen/.*|misc/scripts/models-as-data/.*)\.py$

- repo: local
hooks:
Expand Down
91 changes: 50 additions & 41 deletions misc/scripts/models-as-data/bulk_generate_mad.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
Note: This file must be formatted using the Black Python formatter.
"""

import os.path
import pathlib
import subprocess
import sys
from typing import Required, TypedDict, List, Callable, Optional
Expand Down Expand Up @@ -41,7 +41,7 @@ def missing_module(module_name: str) -> None:
.decode("utf-8")
.strip()
)
build_dir = os.path.join(gitroot, "mad-generation-build")
build_dir = pathlib.Path(gitroot, "mad-generation-build")


# A project to generate models for
Expand Down Expand Up @@ -86,10 +86,10 @@ def clone_project(project: Project) -> str:
git_tag = project.get("git-tag")

# Determine target directory
target_dir = os.path.join(build_dir, name)
target_dir = build_dir / name

# Clone only if directory doesn't already exist
if not os.path.exists(target_dir):
if not target_dir.exists():
if git_tag:
print(f"Cloning {name} from {repo_url} at tag {git_tag}")
else:
Expand Down Expand Up @@ -191,10 +191,10 @@ def build_database(
name = project["name"]

# Create database directory path
database_dir = os.path.join(build_dir, f"{name}-db")
database_dir = build_dir / f"{name}-db"

# Only build the database if it doesn't already exist
if not os.path.exists(database_dir):
if not database_dir.exists():
print(f"Building CodeQL database for {name}...")
extractor_options = [option for x in extractor_options for option in ("-O", x)]
try:
Expand Down Expand Up @@ -236,13 +236,16 @@ def generate_models(config, args, project: Project, database_dir: str) -> None:
language = config["language"]

generator = mad.Generator(language)
# Note: The argument parser converts with-sinks to with_sinks, etc.
generator.generateSinks = should_generate_sinks(project)
generator.generateSources = should_generate_sources(project)
generator.generateSummaries = should_generate_summaries(project)
generator.setenvironment(database=database_dir, folder=name)
generator.with_sinks = should_generate_sinks(project)
generator.with_sources = should_generate_sources(project)
generator.with_summaries = should_generate_summaries(project)
generator.threads = args.codeql_threads
generator.ram = args.codeql_ram
if config.get("single-file", False):
generator.single_file = name
else:
generator.folder = name
generator.setenvironment(database=database_dir)
generator.run()


Expand Down Expand Up @@ -313,20 +316,14 @@ def download_artifact(url: str, artifact_name: str, pat: str) -> str:
if response.status_code != 200:
print(f"Failed to download file. Status code: {response.status_code}")
sys.exit(1)
target_zip = os.path.join(build_dir, zipName)
target_zip = build_dir / zipName
with open(target_zip, "wb") as file:
for chunk in response.iter_content(chunk_size=8192):
file.write(chunk)
print(f"Download complete: {target_zip}")
return target_zip


def remove_extension(filename: str) -> str:
    """Strip every trailing extension from a file name (e.g. "db.tar.gz" -> "db").

    Loops until `os.path.splitext` no longer removes anything, rather than
    until no "." remains in the string: `splitext` ignores leading dots and
    dots in directory components, so a `while "." in filename` loop would hang
    forever on inputs like ".bashrc" or "some.dir/name".
    """
    root, ext = os.path.splitext(filename)
    while ext:
        # splitext removed a real extension; keep peeling.
        filename = root
        root, ext = os.path.splitext(filename)
    return filename


def pretty_name_from_artifact_name(artifact_name: str) -> str:
    """Return the project name embedded in a DCA artifact name.

    Artifact names look like "<prefix>___<pretty-name>[___...]"; the second
    "___"-separated field is the human-readable project name.
    """
    fields = artifact_name.split("___")
    return fields[1]

Expand All @@ -348,7 +345,7 @@ def download_dca_databases(
"""
print("\n=== Finding projects ===")
project_map = {project["name"]: project for project in projects}
analyzed_databases = {}
analyzed_databases = {n: None for n in project_map}
for experiment_name in experiment_names:
response = get_json_from_github(
f"https://raw.githubusercontent.com/github/codeql-dca-main/data/{experiment_name}/reports/downloads.json",
Expand All @@ -361,17 +358,24 @@ def download_dca_databases(
artifact_name = analyzed_database["artifact_name"]
pretty_name = pretty_name_from_artifact_name(artifact_name)

if not pretty_name in project_map:
if not pretty_name in analyzed_databases:
print(f"Skipping {pretty_name} as it is not in the list of projects")
continue

if pretty_name in analyzed_databases:
if analyzed_databases[pretty_name] is not None:
print(
f"Skipping previous database {analyzed_databases[pretty_name]['artifact_name']} for {pretty_name}"
)

analyzed_databases[pretty_name] = analyzed_database

not_found = [name for name, db in analyzed_databases.items() if db is None]
if not_found:
print(
f"ERROR: The following projects were not found in the DCA experiments: {', '.join(not_found)}"
)
sys.exit(1)

def download_and_decompress(analyzed_database: dict) -> str:
artifact_name = analyzed_database["artifact_name"]
repository = analyzed_database["repository"]
Expand All @@ -393,19 +397,17 @@ def download_and_decompress(analyzed_database: dict) -> str:
# The database is in a zip file, which contains a tar.gz file with the DB
# First we open the zip file
with zipfile.ZipFile(artifact_zip_location, "r") as zip_ref:
artifact_unzipped_location = os.path.join(build_dir, artifact_name)
artifact_unzipped_location = build_dir / artifact_name
# clean up any remnants of previous runs
shutil.rmtree(artifact_unzipped_location, ignore_errors=True)
# And then we extract it to build_dir/artifact_name
zip_ref.extractall(artifact_unzipped_location)
# And then we extract the language tar.gz file inside it
artifact_tar_location = os.path.join(
artifact_unzipped_location, f"{language}.tar.gz"
)
artifact_tar_location = artifact_unzipped_location / f"{language}.tar.gz"
with tarfile.open(artifact_tar_location, "r:gz") as tar_ref:
# And we just untar it to the same directory as the zip file
tar_ref.extractall(artifact_unzipped_location)
ret = os.path.join(artifact_unzipped_location, language)
ret = artifact_unzipped_location / language
print(f"Decompression complete: {ret}")
return ret

Expand All @@ -425,8 +427,16 @@ def download_and_decompress(analyzed_database: dict) -> str:
return [(project_map[n], r) for n, r in zip(analyzed_databases, results)]


def get_mad_destination_for_project(config, name: str) -> str:
    """Path under the configured destination where *name*'s MaD files go."""
    destination = config["destination"]
    return os.path.join(destination, name)
def clean_up_mad_destination_for_project(config, name):
    """Delete any previously generated MaD output for *name*.

    In single-file mode the output is one "<name>.model.yml" file; otherwise
    it is a directory named after the project.
    """
    target = pathlib.Path(config["destination"], name)
    if not config.get("single-file", False):
        # Directory-per-project layout.
        if target.exists():
            print(f"Deleting existing MaD directory at {target}")
            shutil.rmtree(target, ignore_errors=True)
        return
    # Single-file layout. NOTE(review): with_suffix replaces an existing
    # suffix, so a project name containing "." would be truncated — assumed
    # not to occur in practice; confirm against the project configs.
    target = target.with_suffix(".model.yml")
    if target.exists():
        print(f"Deleting existing MaD file at {target}")
        target.unlink()


def get_strategy(config) -> str:
Expand All @@ -448,8 +458,7 @@ def main(config, args) -> None:
language = config["language"]

# Create build directory if it doesn't exist
if not os.path.exists(build_dir):
os.makedirs(build_dir)
build_dir.mkdir(parents=True, exist_ok=True)

database_results = []
match get_strategy(config):
Expand All @@ -469,7 +478,7 @@ def main(config, args) -> None:
if args.pat is None:
print("ERROR: --pat argument is required for DCA strategy")
sys.exit(1)
if not os.path.exists(args.pat):
if not args.pat.exists():
print(f"ERROR: Personal Access Token file '{pat}' does not exist.")
sys.exit(1)
with open(args.pat, "r") as f:
Expand All @@ -493,12 +502,9 @@ def main(config, args) -> None:
)
sys.exit(1)

# Delete the MaD directory for each project
for project, database_dir in database_results:
mad_dir = get_mad_destination_for_project(config, project["name"])
if os.path.exists(mad_dir):
print(f"Deleting existing MaD directory at {mad_dir}")
subprocess.check_call(["rm", "-rf", mad_dir])
# clean up existing MaD data for the projects
for project, _ in database_results:
clean_up_mad_destination_for_project(config, project["name"])

for project, database_dir in database_results:
if database_dir is not None:
Expand All @@ -508,7 +514,10 @@ def main(config, args) -> None:
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--config", type=str, help="Path to the configuration file.", required=True
"--config",
type=pathlib.Path,
help="Path to the configuration file.",
required=True,
)
parser.add_argument(
"--dca",
Expand All @@ -519,13 +528,13 @@ def main(config, args) -> None:
)
parser.add_argument(
"--pat",
type=str,
type=pathlib.Path,
help="Path to a file containing the PAT token required to grab DCA databases (the same as the one you use for DCA)",
)
parser.add_argument(
"--codeql-ram",
type=int,
help="What `--ram` value to pass to `codeql` while generating models (by default the flag is not passed)",
help="What `--ram` value to pass to `codeql` while generating models (by default 2048 MB per thread)",
default=None,
)
parser.add_argument(
Expand All @@ -538,7 +547,7 @@ def main(config, args) -> None:

# Load config file
config = {}
if not os.path.exists(args.config):
if not args.config.exists():
print(f"ERROR: Config file '{args.config}' does not exist.")
sys.exit(1)
try:
Expand Down
59 changes: 41 additions & 18 deletions misc/scripts/models-as-data/convert_extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,65 +7,86 @@
import sys
import tempfile


def quote_if_needed(v):
    """Render one bqrs cell for inclusion in a model yaml row.

    String columns are wrapped in double quotes; every other column type
    (e.g. the bool columns) is emitted via str().
    """
    if isinstance(v, str):
        return '"' + v + '"'
    # bool column
    return str(v)


def parseData(data):
    """Partition raw result rows into two yaml-fragment maps keyed by row[0].

    Index 0 collects manually written models, index 1 those whose provenance
    column (the last cell) ends with "generated".
    """
    manual, generated = {}, {}
    for row in data:
        cells = ", ".join(map(quote_if_needed, row))
        bucket = generated if row[-1].endswith("generated") else manual
        helpers.insert_update(bucket, row[0], " - [" + cells + "]\n")
    return [manual, generated]


class Converter:
def __init__(self, language, dbDir):
    """Collect the paths needed to convert models for *language* from *dbDir*."""
    self.language = language
    self.dbDir = dbDir
    # Repository root, used to locate the queries and the ext/ directory.
    gitroot = subprocess.check_output(["git", "rev-parse", "--show-toplevel"])
    self.codeQlRoot = gitroot.decode("utf-8").strip()
    self.extDir = os.path.join(self.codeQlRoot, f"{self.language}/ql/lib/ext/")
    self.dirname = "modelconverter"
    self.modelFileExtension = ".model.yml"
    # Scratch directory for intermediate bqrs output.
    self.workDir = tempfile.mkdtemp()


def runQuery(self, query):
    """Run *query* against self.dbDir and return the decoded bqrs rows."""
    print("########## Querying: ", query)
    queryFile = os.path.join(
        self.codeQlRoot, f"{self.language}/ql/src/utils/{self.dirname}", query
    )
    resultBqrs = os.path.join(self.workDir, "out.bqrs")
    cmd = [
        "codeql",
        "query",
        "run",
        queryFile,
        "--database",
        self.dbDir,
        "--output",
        resultBqrs,
    ]
    helpers.run_cmd(cmd, "Failed to generate " + query)
    return helpers.readData(self.workDir, resultBqrs)


def asAddsTo(self, rows, predicate):
    """Wrap each grouped row-set in an addsTo template for *predicate*.

    Returns a two-element list mirroring *rows* (manual / generated).
    """
    pack = f"codeql/{self.language}-all"
    extensions = [{}, {}]
    for i in (0, 1):
        for key, body in rows[i].items():
            extensions[i][key] = helpers.addsToTemplate.format(pack, predicate, body)
    return extensions

def getAddsTo(self, query, predicate):
    """Run *query* and convert its results into addsTo extension snippets.

    Returns a two-element list: index 0 holds manual models, index 1 the
    "generated"-provenance ones (see parseData).
    """
    data = self.runQuery(query)
    rows = parseData(data)
    return self.asAddsTo(rows, predicate)


def makeContent(self):
    """Collect summaries, sources, sinks and neutrals as merged extensions.

    Queries run in the original order (summaries first); the merge argument
    order (sources, sinks, summaries, neutrals) is preserved for each of the
    two buckets (manual / generated).
    """
    summaries = self.getAddsTo("ExtractSummaries.ql", helpers.summaryModelPredicate)
    sources = self.getAddsTo("ExtractSources.ql", helpers.sourceModelPredicate)
    sinks = self.getAddsTo("ExtractSinks.ql", helpers.sinkModelPredicate)
    neutrals = self.getAddsTo("ExtractNeutrals.ql", helpers.neutralModelPredicate)
    return [
        helpers.merge(sources[i], sinks[i], summaries[i], neutrals[i])
        for i in (0, 1)
    ]

def save(self, extensions):
# Create directory if it doesn't exist
Expand All @@ -77,9 +98,11 @@ def save(self, extensions):
for entry in extensions[0]:
with open(self.extDir + "/" + entry + self.modelFileExtension, "w") as f:
f.write(extensionTemplate.format(extensions[0][entry]))

for entry in extensions[1]:
with open(self.extDir + "/generated/" + entry + self.modelFileExtension, "w") as f:
with open(
self.extDir + "/generated/" + entry + self.modelFileExtension, "w"
) as f:
f.write(extensionTemplate.format(extensions[1][entry]))

def run(self):
Expand Down
Loading
Loading