Commit d7059c2

Author: Ethan Pailes (committed)
Add random testing of crates.io regex
This patch adds some infrastructure to scrape crates.io for regexes, then run each regex found this way through a random testing gauntlet to make sure that all the different backends behave in the same way. These random tests are expensive, so we only compile them in when the magic `RUST_REGEX_RANDOM_TEST` environment variable is set.

In debug mode these tests take quite a while, so we special-case them in CI to run in release mode. Cargo does not support per-target profiles, and `build.rs` does not seem to be allowed to set any rustc flags except linker flags, so we have to do it manually in the CI script.

To make this better, we should add something that can generate a matching string from a regex; as is, we only exercise the negative case. There is one bug that this work uncovered and that this patch does not fix. A minimal reproduction of it is commented out in the `tests/test_crates_regex.rs` file.
1 parent d107c80 commit d7059c2
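The test entry point itself (`tests/test_crates_regex.rs`) is not shown in this commit view, but the idea is that each scraped regex is compiled by every backend and the backends are then checked against one another on randomly generated inputs. The sketch below is a minimal illustration of that property, not the crate's actual harness: it assumes a dev-dependency on `quickcheck`, uses a hypothetical pattern and test name, and compares only the default `Regex` and `regex::bytes::Regex` engines as stand-ins for the full set of backends.

```rust
use quickcheck::QuickCheck;

// Hypothetical pattern standing in for one scraped from crates.io.
const PATTERN: &str = r"[a-z]+[0-9]{2,4}";

// The consistency property: for any input, two engines built from the
// same pattern must agree on whether the input matches.
fn backends_agree(input: String) -> bool {
    let re = regex::Regex::new(PATTERN).unwrap();
    let re_bytes = regex::bytes::Regex::new(PATTERN).unwrap();
    re.is_match(&input) == re_bytes.is_match(input.as_bytes())
}

#[test]
fn crates_regex_consistency() {
    QuickCheck::new().quickcheck(backends_agree as fn(String) -> bool);
}
```

Because the inputs are random and the patterns are arbitrary, almost all generated strings fail to match, which is why the commit message notes that only the negative case is really exercised.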

File tree

7 files changed (+3513, -3 lines)


Cargo.toml (+6)

@@ -108,6 +108,12 @@ name = "backtrack-utf8bytes"
 path = "tests/test_backtrack_bytes.rs"
 name = "backtrack-bytes"
 
+# Run all backends against each regex found on crates.io and make sure
+# that they all do the same thing.
+[[test]]
+path = "tests/test_crates_regex.rs"
+name = "crates-regex"
+
 [profile.release]
 debug = true

HACKING.md (+11)

@@ -249,6 +249,11 @@ matching engine we want to test. The entry points are:
   backtracking on every regex and use *arbitrary* byte based programs.
 * `tests/test_backtrack_utf8bytes.rs` - tests `Regex::new`, forced to use
   backtracking on every regex and use *UTF-8* byte based programs.
+* `tests/test_crates_regex.rs` - tests to make sure that all of the
+  backends behave in the same way against a number of quickcheck
+  generated random inputs. These tests need to be enabled through
+  the `RUST_REGEX_RANDOM_TEST` environment variable (see
+  below).
 
 The lazy DFA and pure literal engines are absent from this list because
 they cannot be used on every regular expression. Instead, we rely on
@@ -259,6 +264,12 @@ entry points, it can take a while to compile everything. To reduce compile
 times slightly, try using `cargo test --test default`, which will only use the
 `tests/test_default.rs` entry point.
 
+The random testing takes quite a while, so it is not enabled by default.
+In order to run the random testing you can set the
+`RUST_REGEX_RANDOM_TEST` environment variable to anything before
+invoking `cargo test`. Note that this variable is inspected at compile
+time, so if the tests don't seem to be running, you may need to run
+`cargo clean`.
 
 ## Benchmarking
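The gating code lives in the test entry point rather than in this diff, so the exact mechanism is not visible here. One plausible way to get the compile-time behaviour described above (an assumption, not necessarily what the crate does) is to read the variable with `option_env!`, which bakes its value into the test binary when it is compiled, so later changes to the variable are only picked up after a rebuild, hence the advice to run `cargo clean`:

```rust
// Sketch of compile-time gating for the expensive tests (hypothetical
// test name). `option_env!` is expanded at compile time, so toggling
// RUST_REGEX_RANDOM_TEST afterwards has no effect until a rebuild.
#[test]
fn random_gauntlet() {
    if option_env!("RUST_REGEX_RANDOM_TEST").is_none() {
        return; // the variable was unset when the test binary was built
    }
    // ... run the quickcheck gauntlet over every scraped regex ...
}
```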

ci/script.sh (+9, -3)

@@ -9,12 +9,18 @@ cargo build --verbose
 cargo doc --verbose
 
 # Run tests. If we have nightly, then enable our nightly features.
+CARGO_TEST_EXTRA_FLAGS=""
 if [ "$TRAVIS_RUST_VERSION" = "nightly" ]; then
-    cargo test --verbose --features unstable
-else
-    cargo test --verbose
+    CARGO_TEST_EXTRA_FLAGS="--features unstable"
 fi
 
+cargo test --verbose ${CARGO_TEST_EXTRA_FLAGS}
+
+# Run the random tests in release mode, as this is faster.
+RUST_REGEX_RANDOM_TEST=1 \
+    cargo test --release --verbose \
+    ${CARGO_TEST_EXTRA_FLAGS} --test crates-regex
+
 # Run a test that confirms the shootout benchmarks are correct.
 ci/run-shootout-test

scripts/scrape_crates_io.py (+188, new file)
#!/usr/bin/env python3

from subprocess import call
import argparse
import datetime
import glob
import json
import os
import re
import shutil
import tempfile
import time
import urllib3

CRATES_IO_INDEX_GIT_LOC = "https://github.com/rust-lang/crates.io-index.git"
RE_REGEX = re.compile(r"Regex::new\((r?\".*?\")\)")
KNOWN_UNMAINTAINED_CRATES = set(["queryst-prime", "claude"])

# if only requests was in the standard library...
urllib3.disable_warnings()
http = urllib3.PoolManager()


def argparser():
    p = argparse.ArgumentParser("A script to scrape crates.io for regex.")
    p.add_argument("-c", "--crates-index", metavar="CRATES_INDEX_DIR",
                   help=("A directory where we can find crates.io-index "
                         + "(if this isn't set it will be automatically "
                         + "downloaded)."))
    p.add_argument("-o", "--output-file", metavar="OUTPUT",
                   default="crates_regex.rs",
                   help="The name of the output file to create.")
    return p


PRELUDE = """// Copyright 2018 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// DO NOT EDIT. Automatically generated by 'scripts/scrape_crates_io.py'
// on {date}.


"""


def main():
    args = argparser().parse_args()
    out = open(os.path.abspath(args.output_file), "w")
    out.write(PRELUDE.format(date=str(datetime.datetime.now())))
    if args.crates_index:
        args.crates_index = os.path.abspath(args.crates_index)

    # enter our scratch directory
    old_dir = os.getcwd()
    work_dir = tempfile.mkdtemp(prefix="scrape-crates-io")
    os.chdir(work_dir)

    crates_index = (args.crates_index
                    if args.crates_index
                    else download_crates_index())

    for (name, vers) in iter_crates(crates_index):
        if name in KNOWN_UNMAINTAINED_CRATES:
            continue

        with Crate(work_dir, name, vers) as c:
            i = 0
            for line in c.iter_lines():
                for r in RE_REGEX.findall(line):
                    print((name, vers, r))
                    # Skip captures whose closing quote is preceded by a
                    # backslash; the non-greedy match has probably split
                    # an escaped quote.
                    if len(r) >= 2 and r[-2] == "\\":
                        continue
                    out.write("// {}-{}: {}\n".format(name, vers, r))
                    out.write("consistent!({}_{}, {});\n\n".format(
                        name.replace("-", "_"), i, r))
                    out.flush()
                    i += 1

    # Leave the scratch directory
    os.chdir(old_dir)
    shutil.rmtree(work_dir)
    out.close()


def download_crates_index():
    if call(["git", "clone", CRATES_IO_INDEX_GIT_LOC]) != 0:
        print("Error cloning the crates.io index")
        exit(1)
    return "crates.io-index"


def iter_crates(crates_index):
    exclude = set(["config.json", ".git"])
    for crate_index_file in iter_files(crates_index, exclude=exclude):
        with open(crate_index_file) as f:
            most_recent = list(f)
            most_recent = most_recent[len(most_recent) - 1]

        crate_info = json.loads(most_recent)
        if "regex" not in set(d["name"] for d in crate_info["deps"]):
            continue

        if crate_info["yanked"]:
            continue
        yield (crate_info["name"], crate_info["vers"])


def iter_files(d, exclude=set()):
    for x in os.listdir(d):
        if x in exclude:
            continue

        fullfp = os.path.abspath(d + "/" + x)
        if os.path.isfile(fullfp):
            yield fullfp
        elif os.path.isdir(fullfp):
            for f in iter_files(fullfp, exclude):
                yield f


class Crate(object):
    def __init__(self, work_dir, name, version):
        self.name = name
        self.version = version
        self.url = ("https://crates.io/api/v1/crates/{name}/{version}/download"
                    .format(name=self.name, version=self.version))
        self.filename = "{}/{}-{}.tar.gz".format(
            work_dir, self.name, self.version)

    def __enter__(self):
        max_retries = 1
        retries = 0
        while retries < max_retries:
            retries += 1

            r = http.request("GET", self.url, preload_content=False)
            try:
                print("[{}/{}] Downloading {}".format(
                    retries, max_retries + 1, self.url))
                with open(self.filename, "wb") as f:
                    while True:
                        data = r.read(1024)
                        if not data:
                            break
                        f.write(data)
            # The download uses urllib3, so catch its exception hierarchy
            # on connection problems.
            except urllib3.exceptions.HTTPError:
                time.sleep(1)
                r.release_conn()
                continue

            r.release_conn()
            break

        call(["tar", "-xf", self.filename])

        return self

    def __exit__(self, ty, value, tb):
        # We are going to clean up the whole temp dir anyway, so
        # we don't really need to do this. It's nice to clean up
        # after ourselves though.
        try:
            shutil.rmtree(self.filename[:-len(".tar.gz")])
            os.remove(self.filename)
        except OSError:
            pass

    def iter_srcs(self):
        g = "{crate}/**/*.rs".format(crate=self.filename[:-len(".tar.gz")])
        # recursive=True is required for ** to match nested directories.
        for rsrc in glob.iglob(g, recursive=True):
            yield rsrc

    def iter_lines(self):
        for src in self.iter_srcs():
            with open(src) as f:
                for line in f:
                    yield line


if __name__ == "__main__":
    main()
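Each regex that survives the filters above is emitted into the generated `crates_regex.rs` as a comment naming its source plus one `consistent!` invocation, with a per-crate counter distinguishing multiple regexes from the same crate. For a hypothetical crate `foo-bar` at version `1.2.3`, the output would look roughly like:

```rust
// foo-bar-1.2.3: r"^\d{4}-\d{2}-\d{2}$"
consistent!(foo_bar_0, r"^\d{4}-\d{2}-\d{2}$");

// foo-bar-1.2.3: r"(?i)hello"
consistent!(foo_bar_1, r"(?i)hello");
```

The `consistent!` macro itself is defined in the test harness (not shown on this page) and is what actually drives the cross-backend comparison for each pattern.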
