Skip to content

Add random testing of crates.io regex #472

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 21, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,12 @@ name = "backtrack-utf8bytes"
path = "tests/test_backtrack_bytes.rs"
name = "backtrack-bytes"

# Run all backends against each regex found on crates.io and make sure
# that they all do the same thing.
[[test]]
path = "tests/test_crates_regex.rs"
name = "crates-regex"

[profile.release]
debug = true

Expand Down
11 changes: 11 additions & 0 deletions HACKING.md
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,11 @@ matching engine we want to test. The entry points are:
backtracking on every regex and use *arbitrary* byte based programs.
* `tests/test_backtrack_utf8bytes.rs` - tests `Regex::new`, forced to use
backtracking on every regex and use *UTF-8* byte based programs.
* `tests/test_crates_regex.rs` - tests to make sure that all of the
backends behave in the same way against a number of quickcheck
generated random inputs. These tests need to be enabled through
the `RUST_REGEX_RANDOM_TEST` environment variable (see
below).

The lazy DFA and pure literal engines are absent from this list because
they cannot be used on every regular expression. Instead, we rely on
Expand All @@ -259,6 +264,12 @@ entry points, it can take a while to compile everything. To reduce compile
times slightly, try using `cargo test --test default`, which will only use the
`tests/test_default.rs` entry point.

The random testing takes quite a while, so it is not enabled by default.
In order to run the random testing you can set the
`RUST_REGEX_RANDOM_TEST` environment variable to anything before
invoking `cargo test`. Note that this variable is inspected at compile
time, so if the tests don't seem to be running, you may need to run
`cargo clean`.

## Benchmarking

Expand Down
13 changes: 10 additions & 3 deletions ci/script.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,18 @@ cargo build --verbose
cargo doc --verbose

# Run tests. If we have nightly, then enable our nightly features.
# Right now there are no nightly features, but that may change in the
# future.
CARGO_TEST_EXTRA_FLAGS=""
if [ "$TRAVIS_RUST_VERSION" = "nightly" ]; then
cargo test --verbose --features unstable
else
cargo test --verbose
CARGO_TEST_EXTRA_FLAGS=""
fi
cargo test --verbose ${CARGO_TEST_EXTRA_FLAGS}

# Run the random tests in release mode, as this is faster.
RUST_REGEX_RANDOM_TEST=1 \
cargo test --release --verbose \
${CARGO_TEST_EXTRA_FLAGS} --test crates-regex

# Run a test that confirms the shootout benchmarks are correct.
ci/run-shootout-test
Expand Down
189 changes: 189 additions & 0 deletions scripts/scrape_crates_io.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
#!/usr/bin/env python3

from subprocess import call
import argparse
import datetime
import glob
import json
import os
import re
import shutil
import tempfile
import time
import urllib3

# Git URL of the crates.io registry index; it is cloned when no local
# checkout is supplied on the command line.
CRATES_IO_INDEX_GIT_LOC = "https://github.com/rust-lang/crates.io-index.git"
# Matches a `Regex::new(...)` call whose argument is a plain or raw string
# literal on a single line; group 1 captures the literal, quotes included.
RE_REGEX = re.compile(r"Regex::new\((r?\".*?\")\)")
# Crate names that main() skips outright instead of downloading.
KNOWN_UNMAINTAINED_CRATES = set(["queryst-prime", "oozz"])

# if only requests was in the standard library...
# Silence urllib3's warnings and create the connection pool manager that
# Crate uses for its downloads.
urllib3.disable_warnings()
http = urllib3.PoolManager()


def argparser():
    """Build the command-line parser for the scraper.

    Returns:
        An ``argparse.ArgumentParser`` accepting ``-c/--crates-index``
        (optional path to an existing crates.io-index checkout) and
        ``-o/--output-file`` (defaults to ``crates_regex.rs``).
    """
    # The text has to be passed as ``description``: the first positional
    # parameter of ArgumentParser is ``prog``, so passing the sentence
    # positionally makes it show up as the program name in the usage line.
    p = argparse.ArgumentParser(
        description="A script to scrape crates.io for regex.")
    p.add_argument("-c", "--crates-index", metavar="CRATES_INDEX_DIR",
                   help=("A directory where we can find crates.io-index "
                         + "(if this isn't set it will be automatically "
                         + "downloaded)."))
    p.add_argument("-o", "--output-file", metavar="OUTPUT",
                   default="crates_regex.rs",
                   help="The name of the output file to create.")
    return p


# Header written at the top of the generated Rust file. The `{date}`
# placeholder is filled in by main() via str.format().
PRELUDE = """// Copyright 2018 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// DO NOT EDIT. Automatically generated by 'scripts/scrape_crates_io.py'
// on {date}.



"""


def main():
    """Scrape regex literals from crates on crates.io into a Rust file.

    For every crate yielded by ``iter_crates`` (skipping the names in
    ``KNOWN_UNMAINTAINED_CRATES``) the crate is downloaded, each source
    line is searched with ``RE_REGEX``, and every hit is written to the
    output file as a ``consistent!(<crate>_<n>, <literal>);`` invocation
    preceded by a comment naming the crate and version.

    All downloads happen inside a temporary scratch directory which is
    removed again when the function finishes, whether or not it succeeds.
    """
    args = argparser().parse_args()

    # Resolve user-supplied paths against the caller's working directory
    # *before* changing into the scratch directory.
    output_path = os.path.abspath(args.output_file)
    crates_index = (os.path.abspath(args.crates_index)
                    if args.crates_index else None)

    # enter our scratch directory
    old_dir = os.getcwd()
    work_dir = tempfile.mkdtemp(prefix="scrape-crates-io")
    os.chdir(work_dir)
    try:
        # Only clone the index when the user did not point us at one.
        if crates_index is None:
            crates_index = download_crates_index()

        with open(output_path, "w", encoding="utf-8") as out:
            out.write(PRELUDE.format(date=str(datetime.datetime.now())))

            for (name, vers) in iter_crates(crates_index):
                if name in KNOWN_UNMAINTAINED_CRATES:
                    continue

                with Crate(work_dir, name, vers) as c:
                    # Per-crate counter used to give each generated test
                    # a unique identifier.
                    i = 0
                    for line in c.iter_lines():
                        for r in RE_REGEX.findall(line):
                            print((name, vers, r))
                            # Skip captures whose closing quote is preceded
                            # by a backslash: presumably the lazy match
                            # stopped at an escaped quote, so the literal
                            # would be truncated.
                            if len(r) >= 2 and r[-2] == "\\":
                                continue
                            out.write("// {}-{}: {}\n".format(name, vers, r))
                            out.write("consistent!({}_{}, {});\n\n".format(
                                name.replace("-", "_"), i, r))
                            # Flush per entry so partial results survive an
                            # interrupted run.
                            out.flush()
                            i += 1
    finally:
        # Leave the scratch directory
        os.chdir(old_dir)
        shutil.rmtree(work_dir, ignore_errors=True)


def download_crates_index():
    """Clone the crates.io index into the current working directory.

    Returns:
        The relative directory name ``"crates.io-index"`` created by the
        clone. If ``git clone`` exits with a non-zero status an error is
        printed and the process exits with status 1.
    """
    status = call(["git", "clone", CRATES_IO_INDEX_GIT_LOC])
    if status == 0:
        return "crates.io-index"

    print("Error cloning the crates.io index")
    exit(1)


def iter_crates(crates_index):
    """Yield ``(name, version)`` for each crate that depends on ``regex``.

    Every file under *crates_index* (except ``config.json`` and anything
    inside ``.git``) is treated as a crate index file containing one JSON
    document per line. Only the last non-blank line is inspected. A crate
    is skipped when that entry does not list ``regex`` among its
    dependencies or is marked as yanked.

    Args:
        crates_index: Path to a checkout of the crates.io index.
    """
    exclude = {"config.json", ".git"}
    for crate_index_file in iter_files(crates_index, exclude=exclude):
        # Stream the file and remember only the last non-blank line, so an
        # empty file or a trailing blank line cannot break the scrape.
        most_recent = None
        with open(crate_index_file, encoding="utf-8") as f:
            for line in f:
                if line.strip():
                    most_recent = line
        if most_recent is None:
            continue

        crate_info = json.loads(most_recent)
        if crate_info["yanked"]:
            continue
        if not any(d["name"] == "regex" for d in crate_info["deps"]):
            continue

        yield (crate_info["name"], crate_info["vers"])


def iter_files(d, exclude=frozenset()):
    """Recursively yield the absolute path of every regular file under *d*.

    Args:
        d: Directory to walk.
        exclude: Entry names (not paths) to skip. The check is applied at
            every level, so an excluded directory is not descended into.

    Entries are visited in sorted name order so that repeated runs over
    the same tree produce the same sequence.
    """
    for x in sorted(os.listdir(d)):
        if x in exclude:
            continue

        fullfp = os.path.abspath(os.path.join(d, x))
        if os.path.isfile(fullfp):
            yield fullfp
        elif os.path.isdir(fullfp):
            yield from iter_files(fullfp, exclude)


class Crate(object):
    """A single published crate, downloaded and unpacked on demand.

    Meant to be used as a context manager: entering downloads the crate
    tarball from crates.io to ``<work_dir>/<name>-<version>.tar.gz`` and
    unpacks it with ``tar``; exiting removes the tarball and the unpacked
    tree again.

    ``tar`` unpacks into the current working directory, while the source
    tree is later looked up next to the tarball, so callers are expected
    to have changed into *work_dir* before entering.
    """

    def __init__(self, work_dir, name, version):
        """Record the crate identity and derive its URL and tarball path."""
        self.name = name
        self.version = version
        self.url = ("https://crates.io/api/v1/crates/{name}/{version}/download"
                    .format(name=self.name, version=self.version))
        self.filename = "{}/{}-{}.tar.gz".format(
            work_dir, self.name, self.version)

    def _unpacked_dir(self):
        """Return the directory the tarball is expected to unpack into."""
        return self.filename[:-len(".tar.gz")]

    def __enter__(self):
        """Download and unpack the crate, then return ``self``.

        Download errors are retried up to ``max_retries`` attempts in
        total. If every attempt fails, ``tar`` is still invoked on
        whatever was written and its exit status is ignored, so a broken
        crate simply yields no source lines.
        """
        max_retries = 1
        for attempt in range(1, max_retries + 1):
            print("[{}/{}] Downloading {}".format(
                attempt, max_retries, self.url))
            try:
                # The request itself can fail too, so it belongs inside
                # the try block together with the streaming reads.
                r = http.request("GET", self.url, preload_content=False)
                try:
                    with open(self.filename, "wb") as f:
                        while True:
                            data = r.read(1024)
                            if not data:
                                break
                            f.write(data)
                finally:
                    r.release_conn()
            except urllib3.exceptions.HTTPError:
                # Base class of urllib3's own errors (connection, protocol,
                # timeout, ...). Back off briefly and try again.
                time.sleep(1)
                continue
            break

        call(["tar", "-xf", self.filename])

        return self

    def __exit__(self, ty, value, tb):
        """Remove the unpacked tree and the tarball (best effort)."""
        # We are going to clean up the whole temp dir anyway, so
        # we don't really need to do this. It's nice to clean up
        # after ourselves though, which is why failures are ignored.
        shutil.rmtree(self._unpacked_dir(), ignore_errors=True)
        try:
            os.remove(self.filename)
        except OSError:
            pass

    def iter_srcs(self):
        """Yield the path of every ``*.rs`` file in the unpacked crate."""
        # recursive=True is required for "**" to span any number of
        # directory levels; without it "**" behaves like a single "*".
        g = os.path.join(glob.escape(self._unpacked_dir()), "**", "*.rs")
        for rsrc in glob.iglob(g, recursive=True):
            yield rsrc

    def iter_lines(self):
        """Yield every line of every Rust source file in the crate."""
        for src in self.iter_srcs():
            # Decode leniently so one oddly-encoded file cannot abort the
            # whole scrape.
            with open(src, encoding="utf-8", errors="replace") as f:
                for line in f:
                    yield line


if __name__ == "__main__":
main()
Loading