Skip to content

Commit 5fd3bd1

Browse files
ethanpailesBurntSushi
ethanpailes
authored and committed
test: add random testing of crates.io regex
This patch adds some infastructure to scrape crates.io for regex, then run each of the regex found in this way though a random testing gauntlet to make sure that all the different backends behave in the same way. These random tests are expensive, so we only run them in when the magic `RUST_REGEX_RANDOM_TEST` environment variable is set. In debug mode, these tests take quite a while, so we special case them in CI to run in release mode. To make this better we should add something which can generate a matching string from a regex. As is we just focus on the negative case. There is one bug that this uncovered that this patch does not fixed. A minimal version of it is commented out in the `tests/test_crates_regex.rs` file. PR #472
1 parent d107c80 commit 5fd3bd1

File tree

7 files changed

+3615
-3
lines changed

7 files changed

+3615
-3
lines changed

Cargo.toml

+6
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,12 @@ name = "backtrack-utf8bytes"
108108
path = "tests/test_backtrack_bytes.rs"
109109
name = "backtrack-bytes"
110110

111+
# Run all backends against each regex found on crates.io and make sure
112+
# that they all do the same thing.
113+
[[test]]
114+
path = "tests/test_crates_regex.rs"
115+
name = "crates-regex"
116+
111117
[profile.release]
112118
debug = true
113119

HACKING.md

+11
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,11 @@ matching engine we want to test. The entry points are:
249249
backtracking on every regex and use *arbitrary* byte based programs.
250250
* `tests/test_backtrack_utf8bytes.rs` - tests `Regex::new`, forced to use
251251
backtracking on every regex and use *UTF-8* byte based programs.
252+
* `tests/test_crates_regex.rs` - tests to make sure that all of the
253+
backends behave in the same way against a number of quickcheck
254+
generated random inputs. These tests need to be enabled through
255+
the `RUST_REGEX_RANDOM_TEST` environment variable (see
256+
below).
252257

253258
The lazy DFA and pure literal engines are absent from this list because
254259
they cannot be used on every regular expression. Instead, we rely on
@@ -259,6 +264,12 @@ entry points, it can take a while to compile everything. To reduce compile
259264
times slightly, try using `cargo test --test default`, which will only use the
260265
`tests/test_default.rs` entry point.
261266

267+
The random testing takes quite a while, so it is not enabled by default.
268+
In order to run the random testing you can set the
269+
`RUST_REGEX_RANDOM_TEST` environment variable to anything before
270+
invoking `cargo test`. Note that this variable is inspected at compile
271+
time, so if the tests don't seem to be running, you may need to run
272+
`cargo clean`.
262273

263274
## Benchmarking
264275

ci/script.sh

+10-3
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,18 @@ cargo build --verbose
99
cargo doc --verbose
1010

1111
# Run tests. If we have nightly, then enable our nightly features.
12+
# Right now there are no nightly features, but that may change in the
13+
# future.
14+
CARGO_TEST_EXTRA_FLAGS=""
1215
if [ "$TRAVIS_RUST_VERSION" = "nightly" ]; then
13-
cargo test --verbose --features unstable
14-
else
15-
cargo test --verbose
16+
CARGO_TEST_EXTRA_FLAGS=""
1617
fi
18+
cargo test --verbose ${CARGO_TEST_EXTRA_FLAGS}
19+
20+
# Run the random tests in release mode, as this is faster.
21+
RUST_REGEX_RANDOM_TEST=1 \
22+
cargo test --release --verbose \
23+
${CARGO_TEST_EXTRA_FLAGS} --test crates-regex
1724

1825
# Run a test that confirms the shootout benchmarks are correct.
1926
ci/run-shootout-test

scripts/scrape_crates_io.py

+189
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
#!/usr/bin/env python3
2+
3+
from subprocess import call
4+
import argparse
5+
import datetime
6+
import glob
7+
import json
8+
import os
9+
import re
10+
import shutil
11+
import tempfile
12+
import time
13+
import urllib3
14+
15+
# Git URL of the crates.io registry index.
CRATES_IO_INDEX_GIT_LOC = "https://github.com/rust-lang/crates.io-index.git"
# Matches `Regex::new("...")` / `Regex::new(r"...")` call sites in Rust source,
# capturing the (possibly raw) string literal.
RE_REGEX = re.compile(r"Regex::new\((r?\".*?\")\)")
# Crates we deliberately skip when scraping.
KNOWN_UNMAINTAINED_CRATES = {"queryst-prime", "oozz"}

# if only requests was in the standard library...
urllib3.disable_warnings()
http = urllib3.PoolManager()
22+
23+
24+
def argparser():
    """Build and return the command-line parser for this script."""
    parser = argparse.ArgumentParser("A script to scrape crates.io for regex.")
    parser.add_argument(
        "-c", "--crates-index", metavar="CRATES_INDEX_DIR",
        help=("A directory where we can find crates.io-index "
              + "(if this isn't set it will be automatically "
              + "downloaded)."))
    parser.add_argument(
        "-o", "--output-file", metavar="OUTPUT",
        default="crates_regex.rs",
        help="The name of the output file to create.")
    return parser
34+
35+
36+
# License header and "do not edit" banner written at the top of the generated
# Rust source file; the `{date}` placeholder is filled in by `main`.
PRELUDE = """// Copyright 2018 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// DO NOT EDIT. Automatically generated by 'scripts/scrape_crates_io.py'
// on {date}.



"""
52+
53+
54+
def main():
    """Scrape crates.io for regexes and emit a generated Rust test file.

    For the most recent version of every crate that depends on `regex`,
    grep its sources for `Regex::new(...)` call sites and write one
    `consistent!` macro invocation per regex found to the output file.
    """
    args = argparser().parse_args()
    out = open(os.path.abspath(args.output_file), "w")
    out.write(PRELUDE.format(date=str(datetime.datetime.now())))
    if args.crates_index:
        args.crates_index = os.path.abspath(args.crates_index)

    # enter our scratch directory
    old_dir = os.getcwd()
    work_dir = tempfile.mkdtemp(prefix="scrape-crates-io")
    os.chdir(work_dir)

    # BUG FIX: this used to test `os.path.join(old_dir, args.crates_index)`,
    # which is always truthy and raises a TypeError when no index directory
    # was given (the default). Test the option itself instead, falling back
    # to cloning the index.
    crates_index = (args.crates_index
                    if args.crates_index
                    else download_crates_index())

    for (name, vers) in iter_crates(crates_index):
        if name in KNOWN_UNMAINTAINED_CRATES:
            continue

        with Crate(work_dir, name, vers) as c:
            # `i` numbers the regexes within a single crate so the
            # generated test names stay unique.
            i = 0
            for line in c.iter_lines():
                for r in RE_REGEX.findall(line):
                    print((name, vers, r))
                    # Skip literals whose second-to-last char is a
                    # backslash (the capture likely ends in an escaped
                    # quote that our non-greedy scrape regex mishandled).
                    if len(r) >= 2 and r[-2] == "\\":
                        continue
                    out.write("// {}-{}: {}\n".format(name, vers, r))
                    out.write("consistent!({}_{}, {});\n\n".format(
                        name.replace("-", "_"), i, r))
                    out.flush()
                    i += 1

    # Leave the scratch directory
    os.chdir(old_dir)
    shutil.rmtree(work_dir)
    out.close()
91+
92+
93+
def download_crates_index():
    """Clone the crates.io index into the current directory.

    Terminates the process on clone failure; otherwise returns the
    directory name that git created.
    """
    status = call(["git", "clone", CRATES_IO_INDEX_GIT_LOC])
    if status != 0:
        print("Error cloning the crates.io index")
        exit(1)
    return "crates.io-index"
98+
99+
100+
def iter_crates(crates_index):
    """Yield `(name, version)` of the newest release of every crate in
    the index that depends on `regex` and has not been yanked.
    """
    exclude = set(["config.json", ".git"])
    for index_file in iter_files(crates_index, exclude=exclude):
        with open(index_file) as f:
            lines = f.readlines()
        # The last line of an index file describes the newest release.
        crate_info = json.loads(lines[-1])

        dep_names = set(d["name"] for d in crate_info["deps"])
        if "regex" not in dep_names or crate_info["yanked"]:
            continue
        yield (crate_info["name"], crate_info["vers"])
114+
115+
116+
def iter_files(d, exclude=set()):
    """Recursively yield the absolute paths of all files under `d`,
    skipping any entry (file or directory) whose name is in `exclude`.
    """
    for entry in os.listdir(d):
        if entry in exclude:
            continue

        full_path = os.path.abspath(os.path.join(d, entry))
        if os.path.isfile(full_path):
            yield full_path
        elif os.path.isdir(full_path):
            yield from iter_files(full_path, exclude)
127+
128+
129+
class Crate(object):
    """Context manager that downloads and unpacks one crate from crates.io.

    `__enter__` fetches the crate's tarball into `work_dir` and extracts
    it; `__exit__` removes the unpacked sources and tarball (best effort).
    `iter_srcs`/`iter_lines` walk the unpacked Rust sources.
    """

    def __init__(self, work_dir, name, version):
        self.name = name
        self.version = version
        self.url = ("https://crates.io/api/v1/crates/{name}/{version}/download"
                    .format(name=self.name, version=self.version))
        self.filename = "{}/{}-{}.tar.gz".format(
            work_dir, self.name, self.version)

    def __enter__(self):
        max_retries = 1
        retries = 0
        while retries < max_retries:
            retries += 1

            r = http.request("GET", self.url, preload_content=False)
            try:
                # BUG FIX: the denominator used to be `max_retries + 1`,
                # which displayed one more attempt than will ever be made.
                print("[{}/{}] Downloading {}".format(
                    retries, max_retries, self.url))
                with open(self.filename, "wb") as f:
                    while True:
                        data = r.read(1024)
                        if not data:
                            break
                        f.write(data)
            # BUG FIX: this previously caught
            # `requests.exceptions.ConnectionError`, but `requests` is never
            # imported (the script uses urllib3), so reaching this handler
            # raised a NameError. Catch urllib3's base HTTP error instead.
            except urllib3.exceptions.HTTPError:
                time.sleep(1)
                r.release_conn()
                continue

            r.release_conn()
            break

        call(["tar", "-xf", self.filename])

        return self

    def __exit__(self, ty, value, tb):
        # We are going to clean up the whole temp dir anyway, so
        # we don't really need to do this. It's nice to clean up
        # after ourselves though.
        try:
            shutil.rmtree(self.filename[:-len(".tar.gz")])
            os.remove(self.filename)
        # BUG FIX: `except _:` raised a NameError (`_` is undefined).
        # rmtree/remove raise the OSError family; swallow just those,
        # keeping this deliberately best-effort.
        except OSError:
            pass

    def iter_srcs(self):
        """Yield the paths of all `.rs` files in the unpacked crate."""
        g = "{crate}/**/*.rs".format(crate=self.filename[:-len(".tar.gz")])
        for rsrc in glob.iglob(g):
            yield rsrc

    def iter_lines(self):
        """Yield every line of every Rust source file in the crate."""
        for src in self.iter_srcs():
            with open(src) as f:
                for line in f:
                    yield line
186+
187+
188+
# Run the scraper when this file is executed as a script.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)