Skip to content

Commit 7eec64e

Browse files
author
Ethan Pailes
committed
Add random testing of crates.io regex
This patch adds some infrastructure to scrape crates.io for regex, then run each of the regex found in this way through a random testing gauntlet to make sure that all the different backends behave in the same way. These random tests are expensive, so we add a feature gate around them so that normal CI does not get bogged down. If the CI tests are being run in a cron job, we trigger the random testing. To make this better we should add something which can generate a matching string from a regex. As is, this will just focus on the negative case.
1 parent 2c7ae83 commit 7eec64e

File tree

6 files changed

+3506
-4
lines changed

6 files changed

+3506
-4
lines changed

Cargo.toml

+9
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,9 @@ pattern = []
5757
# Enable to use simd acceleration.
5858
# Note that this is deprecated and is a no-op.
5959
simd-accel = []
60+
# When testing, run the expensive randomized testing suites in addition
61+
# to the fast unit tests.
62+
random-test = []
6063

6164
[lib]
6265
# There are no benchmarks in the library code itself
@@ -107,6 +110,12 @@ name = "backtrack-utf8bytes"
107110
path = "tests/test_backtrack_bytes.rs"
108111
name = "backtrack-bytes"
109112

113+
# Run all backends against each regex found on crates.io and make sure
114+
# that they all do the same thing.
115+
[[test]]
116+
path = "tests/test_crates_regex.rs"
117+
name = "crates-regex"
118+
110119
[profile.release]
111120
debug = true
112121

ci/script.sh

+11-4
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,18 @@ if [ "$TRAVIS_RUST_VERSION" = "1.12.0" ]; then
1717
exit
1818
fi
1919

20-
# Run tests. If we have nightly, then enable our nightly features.
20+
# Run tests.
21+
#
22+
# If we have nightly, then enable our nightly features.
23+
# If we are part of a nightly build, then run expensive random tests.
24+
CARGO_TEST_ARGS=""
25+
if [ "$TRAVIS_EVENT_TYPE" = "cron" ]; then
26+
CARGO_TEST_ARGS="${CARGO_TEST_ARGS} --features random-test"
27+
fi
2128
if [ "$TRAVIS_RUST_VERSION" = "nightly" ]; then
22-
cargo test --verbose --features unstable
23-
else
24-
cargo test --verbose
29+
CARGO_TEST_ARGS="${CARGO_TEST_ARGS} --features unstable"
2530
fi
31+
cargo test --verbose ${CARGO_TEST_ARGS}
2632

2733
# Run a test that confirms the shootout benchmarks are correct.
2834
ci/run-shootout-test
@@ -44,3 +50,4 @@ if [ "$TRAVIS_RUST_VERSION" = "nightly" ]; then
4450
(cd bench && ./run $x --no-run --verbose)
4551
done
4652
fi
53+

scripts/scrape_crates_io.py

+182
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
#!/usr/bin/env python3
2+
3+
import urllib3
4+
import glob
5+
import os
6+
import re
7+
import pdb
8+
import shutil
9+
import json
10+
import tempfile
11+
import argparse
12+
import datetime
13+
import time
14+
from subprocess import call
15+
16+
CRATES_IO_INDEX_GIT_LOC = "https://github.com/rust-lang/crates.io-index.git"
17+
RE_REGEX = re.compile(r"Regex::new\((r?\".*?\")\)")
18+
KNOWN_UNMAINTAINED_CRATES = set(["queryst-prime", "claude"])
19+
20+
# if only requests was in the standard library...
21+
urllib3.disable_warnings()
22+
http = urllib3.PoolManager()
23+
24+
def argparser():
    """Build the command-line parser for the scraper.

    Returns:
        argparse.ArgumentParser: parser accepting -c/--crates-index
        (optional path to an existing crates.io-index checkout) and
        -o/--output-file (defaults to "crates_regex.rs").
    """
    # The first positional argument of ArgumentParser is `prog`, not the
    # help text; pass description= so `--help` renders it correctly.
    p = argparse.ArgumentParser(
        description="A script to scrape crates.io for regex.")
    p.add_argument("-c", "--crates-index", metavar="CRATES_INDEX_DIR",
                   help=("A directory where we can find crates.io-index "
                         "(if this isn't set it will be automatically "
                         "downloaded)."))
    p.add_argument("-o", "--output-file", metavar="OUTPUT",
                   default="crates_regex.rs",
                   help="The name of the output file to create.")
    return p
34+
35+
PRELUDE = """// Copyright 2018 The Rust Project Developers. See the COPYRIGHT
36+
// file at the top-level directory of this distribution and at
37+
// http://rust-lang.org/COPYRIGHT.
38+
//
39+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
40+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
41+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
42+
// option. This file may not be copied, modified, or distributed
43+
// except according to those terms.
44+
45+
// DO NOT EDIT. Automatically generated by 'scripts/scrape_crates_io.py'
46+
// on {date}.
47+
48+
49+
50+
"""
51+
52+
def main():
    """Scrape crates.io for regexes and emit `consistent!` test macros.

    Walks the crates.io index (downloading it when -c/--crates-index is
    not given), downloads the newest release of every crate that depends
    on `regex`, greps its Rust sources for `Regex::new(...)` literals,
    and writes one `consistent!(...)` invocation per regex found to the
    output file.
    """
    args = argparser().parse_args()
    # BUG FIX: os.path.abspath(None) raises TypeError, and -c is
    # optional, so only resolve the index path when it was given.
    # Resolve it before we chdir into the scratch directory.
    if args.crates_index:
        args.crates_index = os.path.abspath(args.crates_index)

    out = open(os.path.abspath(args.output_file), "w")
    out.write(PRELUDE.format(date=str(datetime.datetime.now())))

    # Enter our scratch directory.
    old_dir = os.getcwd()
    work_dir = tempfile.mkdtemp(prefix="scrape-crates-io")
    os.chdir(work_dir)

    try:
        crates_index = (args.crates_index
                        if args.crates_index
                        else download_crates_index())

        for (name, vers) in iter_crates(crates_index):
            if name in KNOWN_UNMAINTAINED_CRATES:
                continue

            with Crate(work_dir, name, vers) as c:
                # Per-crate counter so generated macro names are unique.
                i = 0
                for line in c.iter_lines():
                    for r in RE_REGEX.findall(line):
                        print((name, vers, r))
                        # Skip captures ending in a backslash before the
                        # closing quote: our non-greedy scraping regex
                        # has likely truncated the pattern mid-escape.
                        if len(r) >= 2 and r[-2] == "\\":
                            continue
                        out.write("// {}-{}: {}\n".format(name, vers, r))
                        out.write("consistent!({}_{}, {});\n\n".format(
                            name.replace("-", "_"), i, r))
                        out.flush()
                        i += 1
    finally:
        # Leave the scratch directory and clean up even on error, so a
        # failed run does not leak the temp tree or the open output file.
        os.chdir(old_dir)
        shutil.rmtree(work_dir)
        out.close()
88+
89+
def download_crates_index():
    """Clone the crates.io index into the current working directory.

    Returns:
        str: the directory name of the fresh clone.

    Raises:
        SystemExit: if `git clone` fails.
    """
    if call(["git", "clone", CRATES_IO_INDEX_GIT_LOC]) != 0:
        print("Error cloning the crates.io index")
        # raise SystemExit instead of the site-injected exit() helper,
        # which is absent when run with `python -S`.
        raise SystemExit(1)
    return "crates.io-index"
94+
95+
def iter_crates(crates_index):
    """Yield (name, version) for the newest release of every crate in
    the index that depends on `regex`, skipping yanked releases."""
    skip = set([".git", "config.json"])
    for index_entry in iter_files(crates_index, exclude=skip):
        with open(index_entry) as handle:
            lines = list(handle)

        # Each index file lists releases oldest-first, one JSON object
        # per line; the final line describes the newest release.
        info = json.loads(lines[len(lines) - 1])

        dep_names = set(dep["name"] for dep in info["deps"])
        if "regex" not in dep_names:
            continue
        if info["yanked"]:
            continue

        yield (info["name"], info["vers"])
109+
110+
def iter_files(d, exclude=frozenset()):
    """Recursively yield the absolute path of every file under `d`.

    Args:
        d: directory to walk.
        exclude: names (of files or directories) to skip at any depth.

    The default is a frozenset rather than the original mutable set()
    default, so the shared default object can never be mutated across
    calls; membership testing is unchanged.
    """
    for entry in os.listdir(d):
        if entry in exclude:
            continue

        full = os.path.abspath(os.path.join(d, entry))
        if os.path.isfile(full):
            yield full
        elif os.path.isdir(full):
            # Recurse, propagating the same exclusion set.
            for f in iter_files(full, exclude):
                yield f
121+
122+
123+
class Crate(object):
    """Context manager for a single crates.io crate.

    On entry: downloads the crate's .tar.gz into `work_dir` (retrying
    once on a connection error) and unpacks it with `tar`.
    On exit: best-effort removal of the unpacked tree and tarball.
    """

    def __init__(self, work_dir, name, version):
        self.name = name
        self.version = version
        self.url = ("https://crates.io/api/v1/crates/{name}/{version}/download"
                    .format(name=self.name, version=self.version))
        self.filename = "{}/{}-{}.tar.gz".format(
            work_dir, self.name, self.version)

    def __enter__(self):
        max_retries = 1
        retries = 0
        while retries < max_retries:
            retries += 1

            r = http.request("GET", self.url, preload_content=False)
            try:
                # BUG FIX: banner previously printed max_retries + 1 as
                # the denominator, showing "[1/2]" for a 1-attempt loop.
                print("[{}/{}] Downloading {}".format(
                    retries, max_retries, self.url))
                # Stream the body to disk in 1 KiB chunks.
                with open(self.filename, "wb") as f:
                    while True:
                        data = r.read(1024)
                        if not data:
                            break
                        f.write(data)
            except urllib3.exceptions.HTTPError:
                # BUG FIX: the original caught
                # requests.exceptions.ConnectionError, but `requests` is
                # never imported, so any download failure raised
                # NameError instead of retrying. urllib3's HTTPError is
                # the base of its request-level errors.
                time.sleep(1)
                r.release_conn()
                continue

            r.release_conn()
            break

        call(["tar", "-xf", self.filename])

        return self

    def __exit__(self, ty, value, tb):
        # We are going to clean up the whole temp dir anyway, so
        # we don't really need to do this. It's nice to clean up
        # after ourselves though.
        try:
            shutil.rmtree(self.filename[:-len(".tar.gz")])
            os.remove(self.filename)
        except OSError:
            # Best-effort cleanup only; narrowed from a bare `except:`
            # so we no longer swallow KeyboardInterrupt/SystemExit.
            pass

    def iter_srcs(self):
        """Yield the path of every .rs file in the unpacked crate."""
        g = "{crate}/**/*.rs".format(crate=self.filename[:-len(".tar.gz")])
        # BUG FIX: without recursive=True, Python 3 glob treats ** like
        # a single *, so sources nested more than one directory deep
        # (e.g. src/foo/bar.rs) were silently skipped.
        for rsrc in glob.iglob(g, recursive=True):
            yield rsrc

    def iter_lines(self):
        """Yield every line of every Rust source file in the crate."""
        for src in self.iter_srcs():
            with open(src) as f:
                for line in f:
                    yield line
180+
181+
# Script entry point: scrape crates.io and generate crates_regex.rs.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)