From ff1f543a078415ae042140b8a98c51d3e2d0a5f5 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Fri, 25 Jan 2019 16:05:38 -0800 Subject: [PATCH 01/68] warcio test --- ...le-digest.warc => example-digest-bad.warc} | 0 test/test_archiveiterator.py | 10 +- warcio/archiveiterator.py | 5 +- warcio/cli.py | 11 + warcio/recordloader.py | 9 +- warcio/tester.py | 638 ++++++++++++++++++ 6 files changed, 663 insertions(+), 10 deletions(-) rename test/data/{example-digest.warc => example-digest-bad.warc} (100%) create mode 100644 warcio/tester.py diff --git a/test/data/example-digest.warc b/test/data/example-digest-bad.warc similarity index 100% rename from test/data/example-digest.warc rename to test/data/example-digest-bad.warc diff --git a/test/test_archiveiterator.py b/test/test_archiveiterator.py index 8cba7600..810ba73b 100644 --- a/test/test_archiveiterator.py +++ b/test/test_archiveiterator.py @@ -185,6 +185,8 @@ def test_err_arc_iterator_on_warc(self): def test_corrects_wget_bug(self): with self._find_first_by_type('example-wget-bad-target-uri.warc.gz', 'response') as record: assert record.rec_headers.get('WARC-Target-URI') == 'http://example.com/' + with self._find_first_by_type('example-wget-bad-target-uri.warc.gz', 'response', fixup_bugs=False) as record: + assert record.rec_headers.get('WARC-Target-URI') == 'http://example.com/' def _digests_mutilate_helper(self, contents, expected_t, expected_f, capsys, full_read=False): with pytest.raises(ArchiveLoadFailed): @@ -243,9 +245,9 @@ def test_digests_file(self): expected_t = ['request', 'request', 'request'] # record 1: invalid payload digest - assert self._load_archive('example-digest.warc', check_digests=True) == expected_t - assert self._load_archive('example-digest.warc', check_digests=False) == expected_f + assert self._load_archive('example-digest-bad.warc', check_digests=True) == expected_t + assert self._load_archive('example-digest-bad.warc', check_digests=False) == expected_f # record 2: b64 digest; record 
3: b64 filename safe digest - assert self._load_archive('example-digest.warc', offset=922, check_digests=True) == expected_t - assert self._load_archive('example-digest.warc', offset=922, check_digests=False) == expected_t + assert self._load_archive('example-digest-bad.warc', offset=922, check_digests=True) == expected_t + assert self._load_archive('example-digest-bad.warc', offset=922, check_digests=False) == expected_t diff --git a/warcio/archiveiterator.py b/warcio/archiveiterator.py index 8f6a1b55..0d1fe2dd 100644 --- a/warcio/archiveiterator.py +++ b/warcio/archiveiterator.py @@ -43,12 +43,13 @@ class ArchiveIterator(six.Iterator): def __init__(self, fileobj, no_record_parse=False, verify_http=False, arc2warc=False, ensure_http_headers=False, block_size=BUFF_SIZE, - check_digests=False): + check_digests=False, fixup_bugs=True): self.fh = fileobj self.loader = ArcWarcRecordLoader(verify_http=verify_http, - arc2warc=arc2warc) + arc2warc=arc2warc, + fixup_bugs=fixup_bugs) self.known_format = None self.mixed_arc_warc = arc2warc diff --git a/warcio/cli.py b/warcio/cli.py index 85c4a750..7e40cdad 100644 --- a/warcio/cli.py +++ b/warcio/cli.py @@ -8,6 +8,7 @@ from warcio.indexer import Indexer from warcio.checker import Checker +from warcio.tester import Tester from warcio.utils import BUFF_SIZE import tempfile @@ -54,6 +55,10 @@ def main(args=None): check.add_argument('-v', '--verbose', action='store_true') check.set_defaults(func=checker) + test = subparsers.add_parser('test', help='WARC standards tester') + test.add_argument('inputs', nargs='+') + test.set_defaults(func=tester) + cmd = parser.parse_args(args=args) cmd.func(cmd) @@ -106,6 +111,12 @@ def checker(cmd): sys.exit(_checker.process_all()) +# ============================================================================ +def tester(cmd): + _tester = Tester(cmd) + sys.exit(_tester.process_all()) + + # ============================================================================ class Recompressor(object): 
def __call__(self, cmd): diff --git a/warcio/recordloader.py b/warcio/recordloader.py index 2467bde3..1f17d1f0 100644 --- a/warcio/recordloader.py +++ b/warcio/recordloader.py @@ -55,7 +55,7 @@ class ArcWarcRecordLoader(object): NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:') HTTP_SCHEMES = ('http:', 'https:') - def __init__(self, verify_http=True, arc2warc=True): + def __init__(self, verify_http=True, arc2warc=True, fixup_bugs=True): if arc2warc: self.arc_parser = ARC2WARCHeadersParser() else: @@ -65,6 +65,7 @@ def __init__(self, verify_http=True, arc2warc=True): self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http) self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http) + self.fixup_bugs = fixup_bugs def parse_record_stream(self, stream, statusline=None, @@ -96,7 +97,7 @@ def parse_record_stream(self, stream, elif the_format in ('warc', 'arc2warc'): rec_type = rec_headers.get_header('WARC-Type') - uri = self._ensure_target_uri_format(rec_headers) + uri = self._ensure_target_uri_format(rec_headers, fixup_bugs=self.fixup_bugs) length = rec_headers.get_header('Content-Length') content_type = rec_headers.get_header('Content-Type') if the_format == 'warc': @@ -235,7 +236,7 @@ def _detect_type_load_headers(self, stream, msg = 'Unknown archive format, first line: ' raise ArchiveLoadFailed(msg + str(se.statusline)) - def _ensure_target_uri_format(self, rec_headers): + def _ensure_target_uri_format(self, rec_headers, fixup_bugs=True): """Checks the value for the WARC-Target-URI header field to see if it starts with '<' and ends with '>' (Wget 1.19 bug) and if '<' and '>' are present, corrects and updates the field returning the corrected value for the field @@ -246,7 +247,7 @@ def _ensure_target_uri_format(self, rec_headers): :rtype: str | None """ uri = rec_headers.get_header('WARC-Target-URI') - if uri is not None and uri.startswith('<') and uri.endswith('>'): + if fixup_bugs and uri is not None and uri.startswith('<') and 
uri.endswith('>'): uri = uri[1:-1] rec_headers.replace_header('WARC-Target-URI', uri) return uri diff --git a/warcio/tester.py b/warcio/tester.py new file mode 100644 index 00000000..800f797e --- /dev/null +++ b/warcio/tester.py @@ -0,0 +1,638 @@ +from __future__ import print_function + +import re +import ipaddress +import sys +import traceback + +from warcio.archiveiterator import WARCIterator + + +class Commentary: + def __init__(self, record_id, rec_type): + self._record_id = record_id + self._rec_type = rec_type + self.errors = [] + self.recommendations = [] + self._comments = [] + + def record_id(self): + return self._record_id + + def rec_type(self): + return self._rec_type + + def error(self, *args): + self.errors.append(args) + + def recommendation(self, *args): + self.recommendations.append(args) + + def comment(self, *args): + self._comments.append(args) + + def has_comments(self): + if self.errors or self.recommendations or self._comments: + return True + + def comments(self): + for e in self.errors: + yield 'error: ' + ' '.join(e) + for r in self.recommendations: + yield 'recommendation: ' + ' '.join(r) + for c in self._comments: + yield 'comment: ' + ' '.join(c) + + +class WrapRecord(object): + def __init__(self, obj): + self.obj = obj + self._content = None + + def __getattr__(self, name): + if name == 'content': + if self._content is None: + self._content = self.obj.content_stream().read() + return self._content + return getattr(self.__dict__['obj'], name) + + +def canon_content_type(s): + return s.lower().replace('; ', ';') + + +def validate_warc_fields(record, commentary): + # warc-fields = *named-field CRLF + # named-field = field-name ":" [ field-value ] + # field-value = *( field-content | LWS ) # LWS signals continuations + # field-name = token # token_re + + content = record.content + try: + text = content.decode('utf-8', errors='strict') + except UnicodeDecodeError as e: + commentary.error('warc-fields contains invalid utf-8: '+str(e)) + text 
= content.decode('utf-8', errors='replace') + + first_line = True + lines = [] + for line in text.splitlines(True): + if not line.endswith('\r\n'): + commentary.error('warc-fields lines must end with \r\n') + line = line.rstrip('\r\n') + else: + line = line[:-2] + + if line.startswith(' ') or line.startswith('\t'): + if first_line: + commentary.error('The first line of warc-fields cannot start with whitespace') + else: + lines[-1] += ' ' + line[1:] + elif line == '': + # are blank lines prohibited? + pass + else: + # check for field-name : + if ':' not in line: + commentary.error('Missing field-name : in warc-fields line', line) + else: + field_name = line.split(':', 1)[0] + if not re.fullmatch(token_re, field_name): + commentary('invalid warc-fields name', field_name) + else: + lines.append(line) + first_line = False + + # check known fields + + +def validate_warcinfo(record, commentary, pending): + content_type = record.rec_headers.get_header('Content-Type') + if content_type.lower() != 'application/warc-fields': + commentary.recommencation('warcinfo Content-Type of application/warc-fields, saw', content_type) + else: + # format: warc-fields + # allowable fields include but not limited to DMCI plus the following + # operator, software, robots, hostname, ip, http-header-user-agent, http-header-from + # if operator present, recommended name or name and email address + # comment if http-user-agent here and in the request or metadata record? + # comment if http-header-from here and in the request? 
+ validate_warc_fields(record, commentary) + + # whole-file tests: + # optional that warcinfo be first in file, still deserves a comment + # allowable for warcinfo to appear anywhere + + +def validate_response(record, commentary, pending): + target_uri = record.rec_headers.get_header('WARC-Target-URI').lower() + + if target_uri.startswith('http:') or target_uri.startswith('https:'): + content_type = record.rec_headers.get_header('Content-Type') + if canon_content_type(content_type) not in {'application/http;msgtype=response', 'application/http'}: + commentary.error('responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw ', content_type) + + if record.rec_headers.get_header('WARC-IP-Address') is None: + commentary.error('WARC-IP-Address should be used for http and https responses') + + # error: http and https schemes should have http response headers + # comment: verify http content-length, if present -- commoncrawl nutch bug + + +def validate_resource(record, commentary, pending): + target_uri = record.rec_headers.get_header('WARC-Target-URI').lower() + + if target_uri.startswith('dns:'): + content_type = record.rec_headers.get_header('Content-Type') + if content_type.lower() != 'text/dns': + commentary.error('recource records for dns: shall have Content-Type of text/dns, saw', content_type) + else: + # rfc 2540 and rfc 1035 + #validate_text_dns() + pass + + # should never have http headers + + +def validate_request(record, commentary, pending): + target_uri = record.rec_headers.get_header('WARC-Target-URI').lower() + + if target_uri.startswith('http:') or target_uri.startswith('https:'): + content_type = record.rec_headers.get_header('Content-Type') + + if canon_content_type(content_type) not in {'application/http;msgtype=request', 'application/http'}: + commentary.error('requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw ', content_type) + + if 
record.rec_headers.get_header('WARC-IP-Address') is None: + commentary.error('WARC-IP-Address should be used for http and https requests') + + # error: http and https schemes should have http request headers + + # WARC-Concurrent-To field or fields may be used, comment if present but target record is not + + +def validate_metadata(record, commentary, pending): + content_type = record.rec_headers.get_header('Content-Type') + if content_type.lower() == 'application/warc-fields': + # dublin core plus via, hopsFromSeed, fetchTimeMs -- w1.1 section 6 + # via: uri -- example in Warc 1.1 section 10.5 does not have <> around it + # hopsFromSeed: string + # fetchTimeMs: time in milliseconds, so it's an integer? + validate_warc_fields(record, commentary) + + +def validate_revisit(record, commentary, pending): + warc_profile = record.rec_headers.get_header('WARC-Profile') + + if warc_profile.endswith('/revisit/identical-payload-digest') or warc_profile.endswith('/revisit/uri-agnostic-identical-payload-digest'): + config = { + 'required': ['WARC-Payload-Digest'], + 'recommended': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date'], + } + validate_fields_against_rec_type('revisit', config, record.rec_headers, commentary, allow_all=True) + # may have record block; if not, shall have Content-Length: 0, if yes, should be like a response record, truncated FOR LENGTH ONLY if desired + # recommended that server response headers be preserved "in this manner" + + elif warc_profile.ends_with('/revisit/server-not-modified'): + config = { + 'recommended': ['WARC-Refers-To', 'WARC-Refers-To-Date'], + 'prohibited': ['WARC-Payload-Digest'], + } + validate_fields_against_rec_type('revisit', config, record.rec_headers, commentary, allow_all=True) + # may have content body; if not, shall have Content-Length: 0, if yes, should be like a response record, truncated if desired + # WARC-Refers-To-Date should be the same as WARC-Date in the original record if present + else: + 
commentary.comment('no revisit details validation done due to unknown profile') + + +def validate_conversion(record, commentary, pending): + # where practical, have a warc-refers-to field -- not quite a recommendation, perhaps make it a comment? + # suggests there should be a corresponding metadata record -- which may have a WARC-Refers-To + pass + + +def validate_continuation(record, commentary, pending): + commentary.comment('warcio test continuation code has not been tested, expect bugs') + + warc_type = record.rec_headers.get_header('WARC-Type') + if warc_type in {'warcinfo', 'request', 'metadata', 'revisit'}: + commentary.recommendation('do not segment warc-type', warc_type) + + # last segment: required WARC-Segment-Total-Length, optional WARC-Truncated + + +def validate_actual_uri(field, value, record, version, commentary, pending): + # uri per RFC 3986 + # should use a registered scheme + # %XX encoding, normalize to upper case + # schemes are case-insensitive and normalize to lower + if value.startswith('<') or value.endswith('>'): + # wget 1.19 bug caused by WARC 1.0 spec error + commentary.error('uri must not be within <>', field, value) + if ':' not in value: + commentary.error('invalid uri, no scheme', field, value) + if re.search(r'\s', value, re.A): + commentary.error('invalid uri, contains whitespace', field, value) + scheme, rest = value.split(':', 1) + if not re.fullmatch(r'[A-Za-z][A-Za-z0-9+\-\.]*', scheme, re.A): + commentary.error('invalid uri scheme, bad character', field, value) + # https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml + + +def validate_warc_type(field, value, record, version, commentary, pending): + if not value.islower(): + # I am unclear if this is allowed? 
standard is silent + commentary.comment('Warc-Type is not lower-case', field, value) + if value.lower() not in record_types: + # standard says readers should ignore unknown warc-types + commentary.comment('unknown Warc-Type', field, value) + + +def validate_uri(field, value, record, version, commentary, pending): + # < uri > + if not (value.startswith('<') and value.endswith('>')): + commentary.error('uri must be within <>', field, value) + return + validate_actual_uri(field, value[1:-1], record, version, commentary, pending) + + +def validate_record_id(field, value, record, version, commentary, pending): + validate_uri(field, value, record, version, commentary, pending) + # TODO: should be "globally unique for its period of intended use" + + +def validate_timestamp(field, value, record, version, commentary, pending): + use_ms = False if version == '1.0' else True + if not use_ms: + if '.' in value: + # XXX specification infelicity: would be nice to have 'advice to implementers' here + commentary.error('WARC 1.0 may not have fractional seconds', field, value) + else: + start, end = value.split('.', 1) + if not re.fullmatch(r'[0-9]{1,9}Z', end, re.A): + commentary.error('fractional seconds must have 1-9 digits', field, value) + + # XXX the above is pretty incomplete for dash, colon, trailing Z, etc + + # TODO: "multiple records written as part of a single capture event shall use the same WARC-Date" + # how? 
follow WARC-Concurrent-To pointer(s) from request to response(s) + + +def validate_content_length(field, value, record, version, commentary, pending): + if not value.isdigit(): + commentary.error('must be an integer', field, value) + + +token_re = r'[!"#$%&\'()*+\-\.0-9A-Z\^_`a-z|~]+' +digest_re = r'[A-Za-z0-9/+\-_=]+' + + +def validate_content_type(field, value, record, version, commentary, pending): + if '/' not in value: + commentary.error('must contain a /', field, value) + ctype, rest = value.split('/', 1) + if not re.fullmatch(token_re, ctype, re.A): + commentary.error('invalid type', field, value) + if ';' in rest: + subtype, rest = rest.split(';', 1) + else: + subtype = rest + if not re.fullmatch(token_re, subtype, re.A): + commentary.error('invalid subtype', field, value) + # at this point there can be multiple parameters, + # some of which could have quoted string values with ; in them + # TODO: more checking + + +def validate_digest(field, value, record, version, commentary, pending): + if ':' not in value: + commentary.error('missing algorithm', field, value) + algorithm, digest = value.split(':', 1) + if not re.fullmatch(token_re, algorithm, re.A): + commentary.error('invalid algorithm', field, value) + if not re.fullmatch(token_re, digest, re.A): + # https://github.com/iipc/warc-specifications/issues/48 + # commentary.comment('spec incorrectly says this is an invalid digest', field, value) + pass + if not re.fullmatch(digest_re, digest, re.A): + commentary.comment('Invalid-looking digest value', field, value) + + +def validate_ip(field, value, record, version, commentary, pending): + # ipv4 as dotted quad, or ipv6 per section 2.2 of rfc 4291 + try: + ipaddress.ip_address(value) + except ValueError: + commentary.error('invalid ip', field, value) + + +def validate_truncated(field, value, record, version, commentary, pending): + if value.lower() not in {'length', 'time', 'disconnect', 'unspecified'}: + commentary.comment('extension seen', field, value) + 
+ +def validate_warcinfo_id(field, value, record, version, commentary, pending): + validate_uri(field, value, record, version, commentary, pending) + # TODO: should point at a warcinfo record + + +def validate_filename(field, value, record, version, commentary, pending): + # TODO: text or quoted-string + pass + + +profiles = { + '1.0': ['http://netpreserve.org/warc/1.1/revisit/identical-payload-digest', + 'http://netpreserve.org/warc/1.1/revisit/server-not-modified', + # the following removed from iipc/webarchive-commons in may 2017; common in the wild TODO comment or not? + 'http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest'], + '1.1': ['http://netpreserve.org/warc/1.0/revisit/identical-payload-digest', + 'http://netpreserve.org/warc/1.0/revisit/server-not-modified'], +} + + +def validate_profile(field, value, record, version, commentary, pending): + if version not in profiles: + commentary.comment('no profile check because unknown warc version', field, value) + return + if value not in profiles[version]: + commentary.comment('extension seen', field, value) + + +def validate_segment_number(field, value, record, version, commentary, pending): + if not value.isdigit(): + commentary.error('must be an integer', field, value) + iv = int(value) + if iv == 0: + commentary.error('must be 1 or greater', field, value) + # TODO: type != continuation must have iv == 1, else iv > 1 + # might make that check in the 'continuation' section? 
+ + +def validate_segment_total_length(field, value, record, version, commentary, pending): + if not value.isdigit(): + commentary.error('must be an integer', field, value) + + +warc_fields = { + 'WARC-Type': { + 'validate': validate_warc_type, + }, + 'WARC-Record-ID': { + 'validate': validate_record_id, + }, + 'WARC-Date': { + 'validate': validate_timestamp, + }, + 'Content-Length': { + 'validate': validate_content_length, + }, + 'Content-Type': { + 'validate': validate_content_type, + }, + 'WARC-Concurrent-To': { + 'validate': validate_uri, + }, + 'WARC-Block-Digest': { + 'validate': validate_digest, # openssl check? or just let check_digest get it? + }, + 'WARC-Payload-Digest': { + 'validate': validate_digest, + }, + 'WARC-IP-Address': { + 'validate': validate_ip, + }, + 'WARC-Refers-To': { + 'validate': validate_uri, + }, + 'WARC-Target-URI': { + 'validate': validate_actual_uri, + }, + 'WARC-Truncated': { + 'validate': validate_truncated, + }, + 'WARC-Warcinfo-ID': { + 'validate': validate_warcinfo_id, + }, + 'WARC-Filename': { + 'validate': validate_filename, + }, + 'WARC-Profile': { + 'validate': validate_profile, + }, + 'WARC-Identified-Payload-Type': { + 'validate': validate_content_type, + }, + 'WARC-Segment-Origin-ID': { + 'validate': validate_uri, + }, + 'WARC-Segment-Number': { + 'validate': validate_segment_number, + }, + 'WARC-Segment-Total-Length': { + 'validate': validate_segment_total_length, + }, + 'WARC-Refers-To-Target-URI': { + 'validate': validate_actual_uri, + 'minver': '1.1', + }, + 'WARC-Refers-To-Date': { + 'validate': validate_timestamp, + 'minver': '1.1', + }, +} +warc_fields = dict([(k.lower(), v) for k, v in warc_fields.items()]) + +record_types = { + 'warcinfo': { + 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'Content-Type'], + 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Filename', 'WARC-Truncated'], + 'prohibited': ['WARC-Refers-To', 'WARC-Profile', 'WARC-Identified-Payload-Type'], + 
'validate': validate_warcinfo, + }, + 'response': { + 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', + 'Content-Type', 'WARC-Target-URI'], + 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated', + 'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address'], + 'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], + 'validate': validate_response, + }, + 'resource': { + 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'WARC-Target-URI', 'Content-Type'], + 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated', + 'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-Identified-Payload-Type'], + 'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], + }, + 'request': { + 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', + 'Content-Type', 'WARC-Target-URI'], + 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated', + 'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address'], + 'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], + 'validate': validate_request, + }, + 'metadata': { + 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', + 'Content-Type'], + 'optional': ['WARC-Block-Digest', 'WARC-IP-Address', 'WARC-Truncated', + 'WARC-Concurrent-To', 'WARC-Refers-To', 'WARC-Target-URI', 'WARC-Warcinfo-ID'], + 'prohibited': ['WARC-Payload-Digest', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], + 'validate': validate_metadata, + }, + 'revisit': { + 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', + 'Content-Type', 'WARC-Target-URI', 'WARC-Profile'], + 'optional': ['WARC-Block-Digest', 'WARC-Truncated', 
'WARC-IP-Address', 'WARC-Warcinfo-ID', # normal optionals + 'WARC-Payload-Digest', 'WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date'], # these are for profiles + 'prohibited': ['WARC-Filename'], + 'validate': validate_revisit, + }, + 'conversion': { + 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'WARC-Target-URI'], + 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Truncated', 'WARC-Refers-To', 'WARC-Warcinfo-ID'], + 'prohibited': ['WARC-Concurrent-To', 'WARC-IP-Address', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], + 'validate': validate_conversion, + }, + 'continuation': { + 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', + 'WARC-Segment-Origin-ID', 'WARC-Segment-Number', 'WARC-Target-URI'], + 'optional': [], + 'prohibited': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], + 'validate': validate_continuation, + }, +} + + +def make_header_set(config, kinds): + ret = set() + for kind in kinds: + ret = ret.union(set([x.lower() for x in config.get(kind, [])])) + return ret + + +def validate_fields_against_rec_type(rec_type, config, rec_headers, commentary, allow_all=False): + for req in config.get('required', []): + if not rec_headers.get_header(req): + commentary.error('missing required header', req) + for rec in config.get('recommended', []): + if not rec_headers.get_header(rec): + commentary.recommendation('missing recommended header', rec) + allowed = make_header_set(config, ('required', 'optional', 'recommended')) + prohibited = make_header_set(config, ('prohibited',)) + + for field, value in rec_headers.headers: + fl = field.lower() + if fl in prohibited: + commentary.error('field not allowed in record_type', field, rec_type) + elif allow_all or fl in allowed: + pass + elif fl in warc_fields: + 
commentary.comment('no configuration seen for', field, rec_type) + else: + # an 'unknown field' comment has already been issued in validate_record + pass + + +def validate_record_against_rec_type(config, record, commentary, pending): + if 'validate' in config: + config['validate'](record, commentary, pending) + + +def validate_record(record): + version = record.rec_headers.protocol.split('/', 1)[1] # XXX not exported? + + record_id = record.rec_headers.get_header('WARC-Record-ID') + rec_type = record.rec_headers.get_header('WARC-Type') + if record_id is None: + print('no WARC-Record-ID seen, skipping validation', file=sys.stderr) + return + commentary = Commentary(record_id, rec_type) + pending = None + + seen_fields = set() + for field, value in record.rec_headers.headers: + field_case = field + field = field.lower() + if field != 'warc-concurrent-to' and field in seen_fields: + commentary.error('duplicate field seen', field, value) + if field not in warc_fields: + commentary.comment('unknown field, no validation performed', field_case, value) + continue + config = warc_fields[field] + if 'minver' in config: + if version < config['minver']: + # unknown fields are extensions, so this is a comment and not an error + commentary.comment('field was introduced after this warc version', field_case, value, version) + if 'validate' in config: + config['validate'](field, value, record, version, commentary, pending) + + # TODO: validate warc types: unknown should get a comment + if rec_type not in record_types: + commentary.comment('unknown record type, no validation performed', rec_type) + else: + validate_fields_against_rec_type(rec_type, record_types[rec_type], record.rec_headers, commentary) + validate_record_against_rec_type(record_types[rec_type], record, commentary, pending) + + return commentary + + +def _process_one(warc): + if warc.endswith('.arc') or warc.endswith('.arc.gz'): + return + with open(warc, 'rb') as stream: + for record in WARCIterator(stream, 
check_digests=True, fixup_bugs=False): + + try: + record = WrapRecord(record) + digest_present = (record.rec_headers.get_header('WARC-Payload-Digest') or + record.rec_headers.get_header('WARC-Block-Digest')) + + commentary = validate_record(record) + + record.content # make sure digests are checked + # XXX might need to read and digest the raw stream to check digests for chunked encoding? + # XXX chunked lacks Content-Length and presumably the digest needs to be computed on the non-chunked bytes + except Exception: + # because of the top-level try: to catch exceptions in WARCIterator, this is needed to debug our code + print('Caught exception in warcio test analysis code') + traceback.print_exc() + exit(1) + + if commentary.has_comments() or record.digest_checker.passed is False: + print(' ', 'WARC-Record-ID', commentary.record_id()) + print(' ', 'WARC-Type', commentary.rec_type()) + + if record.digest_checker.passed is True: + print(' digest pass') + elif record.digest_checker.passed is None: + if digest_present: + print(' digest present but not checked') + else: + print(' digest not present') + for p in record.digest_checker.problems: + print(' ', p) + + if commentary.has_comments(): + for c in commentary.comments(): + print(' ', c) + + +class Tester(object): + def __init__(self, cmd): + self.inputs = cmd.inputs + self.verbose = cmd.verbose + self.exit_value = 0 + + def process_all(self): + for warc in self.inputs: + print(warc) + try: + self.process_one(warc) + except Exception as e: + print(' saw exception '+str(e).rstrip(), file=sys.stderr) + print(' skipping rest of file', file=sys.stderr) + return self.exit_value + + def process_one(self, filename): + _process_one(filename) From ebb721f768e7b0e4c98c93820fcb6743c3c96025 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Fri, 25 Jan 2019 16:12:21 -0800 Subject: [PATCH 02/68] documentation --- README.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.rst b/README.rst index d4990f06..513f97d8 
100644 --- a/README.rst +++ b/README.rst @@ -368,6 +368,14 @@ of WARC records, if possible. An exit value of 1 indicates a failure. ``warcio check -v`` will print verbose output for each record in the WARC file. +Test +~~~~ + +The ``warcio test`` command will check one or more WARC files against +the WARC standard, giving commentary about standards violations, +recommendations, and other issues. + + Recompress ~~~~~~~~~~ From 7aa060db1aa7c8d53c8c33fd3f716f0e1686ddf1 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Fri, 25 Jan 2019 16:42:58 -0800 Subject: [PATCH 03/68] tests --- test/test_archiveiterator.py | 2 +- test/test_cli.py | 2 +- warcio/tester.py | 5 +++-- warcio/utils.py | 6 +++--- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/test/test_archiveiterator.py b/test/test_archiveiterator.py index 810ba73b..2015c63b 100644 --- a/test/test_archiveiterator.py +++ b/test/test_archiveiterator.py @@ -186,7 +186,7 @@ def test_corrects_wget_bug(self): with self._find_first_by_type('example-wget-bad-target-uri.warc.gz', 'response') as record: assert record.rec_headers.get('WARC-Target-URI') == 'http://example.com/' with self._find_first_by_type('example-wget-bad-target-uri.warc.gz', 'response', fixup_bugs=False) as record: - assert record.rec_headers.get('WARC-Target-URI') == 'http://example.com/' + assert record.rec_headers.get('WARC-Target-URI') == '' def _digests_mutilate_helper(self, contents, expected_t, expected_f, capsys, full_read=False): with pytest.raises(ArchiveLoadFailed): diff --git a/test/test_cli.py b/test/test_cli.py index dc643ec4..4aaa96fd 100644 --- a/test/test_cli.py +++ b/test/test_cli.py @@ -90,7 +90,7 @@ def test_check_valid(): def test_check_invalid(): - filenames = [get_test_file('example-digest.warc')] + filenames = [get_test_file('example-digest-bad.warc')] args = ['check'] + filenames value = check_helper(args, 1) diff --git a/warcio/tester.py b/warcio/tester.py index 800f797e..de456dc8 100644 --- a/warcio/tester.py +++ 
b/warcio/tester.py @@ -6,6 +6,7 @@ import traceback from warcio.archiveiterator import WARCIterator +from warcio.utils import to_native_str class Commentary: @@ -69,10 +70,10 @@ def validate_warc_fields(record, commentary): content = record.content try: - text = content.decode('utf-8', errors='strict') + text = to_native_str(content, 'utf-8', errors='strict') except UnicodeDecodeError as e: commentary.error('warc-fields contains invalid utf-8: '+str(e)) - text = content.decode('utf-8', errors='replace') + text = to_native_str(content, 'utf-8', errors='replace') first_line = True lines = [] diff --git a/warcio/utils.py b/warcio/utils.py index 23050548..6fd8a92f 100644 --- a/warcio/utils.py +++ b/warcio/utils.py @@ -13,14 +13,14 @@ # #=========================================================================== -def to_native_str(value, encoding='utf-8'): +def to_native_str(value, encoding='utf-8', errors='strict'): if isinstance(value, str): return value if six.PY3 and isinstance(value, six.binary_type): #pragma: no cover - return value.decode(encoding) + return value.decode(encoding, errors) elif six.PY2 and isinstance(value, six.text_type): #pragma: no cover - return value.encode(encoding) + return value.encode(encoding, errors) else: return value From 24f300055ceaa32120e51c0a29d73a6f573b1e7d Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Fri, 25 Jan 2019 17:03:04 -0800 Subject: [PATCH 04/68] tests --- warcio/tester.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/warcio/tester.py b/warcio/tester.py index de456dc8..386586bb 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -1,7 +1,6 @@ from __future__ import print_function import re -import ipaddress import sys import traceback @@ -9,6 +8,14 @@ from warcio.utils import to_native_str +def try_ipaddress_init(): + # ipaddress is in 3.3+ but not 2.7. It is in pypi but we wish to limit dependencies. 
+ try: + import ipaddress + except ImportError: # pragma: no cover + pass + + class Commentary: def __init__(self, record_id, rec_type): self._record_id = record_id @@ -325,6 +332,8 @@ def validate_ip(field, value, record, version, commentary, pending): ipaddress.ip_address(value) except ValueError: commentary.error('invalid ip', field, value) + except NameError: + commentary.comment('did not check ip address format, install ipaddress module from pypi if you care') def validate_truncated(field, value, record, version, commentary, pending): @@ -622,8 +631,8 @@ def _process_one(warc): class Tester(object): def __init__(self, cmd): self.inputs = cmd.inputs - self.verbose = cmd.verbose self.exit_value = 0 + try_ipaddress_init() def process_all(self): for warc in self.inputs: From 40f9fc66292bad2773a33a4c347f5e24280f9ad4 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Sat, 26 Jan 2019 08:39:22 -0800 Subject: [PATCH 05/68] coverage --- test/data/standard-torture-missing.warc | 5 + .../standard-torture-validate-record.warc | 79 ++++++++++ test/test_tests.py | 149 ++++++++++++++++++ warcio/tester.py | 79 ++++++---- 4 files changed, 278 insertions(+), 34 deletions(-) create mode 100644 test/data/standard-torture-missing.warc create mode 100644 test/data/standard-torture-validate-record.warc create mode 100644 test/test_tests.py diff --git a/test/data/standard-torture-missing.warc b/test/data/standard-torture-missing.warc new file mode 100644 index 00000000..a1ab0714 --- /dev/null +++ b/test/data/standard-torture-missing.warc @@ -0,0 +1,5 @@ +WARC/1.0 +WARC-Type: warcinfo +Content-Length: 0 + + diff --git a/test/data/standard-torture-validate-record.warc b/test/data/standard-torture-validate-record.warc new file mode 100644 index 00000000..5181ea38 --- /dev/null +++ b/test/data/standard-torture-validate-record.warc @@ -0,0 +1,79 @@ +WARC/1.0 +WARC-Type: warcinfo +Content-Type: application/warc-fields +Content-Length: 146 + + first line can't start with a space +test: 
invalid utf8 �( +test: lines should end with \r\n +foo: + bar + +no colon +token cannot have a space: + + +WARC/1.0 +WARC-Type: warcinfo +Content-Type: application/warc-fields +Content-Length: 0 + + +WARC/1.0 +WARC-Type: response +WARC-Target-URI: HtTp://example.com/ +Content-Type: text/plain +Content-Length: 0 + + +WARC/1.0 +WARC-Type: resource +WARC-Target-URI: DnS:asdfasdf +Content-Type: text/plain +Content-Length: 0 + + +WARC/1.0 +WARC-Type: resource +WARC-Target-URI: DnS:asdfasdf +Content-Type: text/dns +Content-Length: 0 + + +WARC/1.0 +WARC-Type: request +WARC-Target-URI: hTtP://example.com/ +Content-Type: text/plain +Content-Length: 0 + + +WARC/1.0 +WARC-Type: metadata +Content-Type: application/warc-fields +Content-Length: 0 + + +WARC/1.0 +WARC-Type: revisit +WARC-Profile: none +Content-Length: 0 + + +WARC/1.0 +WARC-Type: revisit +WARC-Profile: http://netpreserve.org/warc/1.1/revisit/identical-payload-digest +Content-Length: 0 + + +WARC/1.0 +WARC-Type: revisit +WARC-Profile: http://netpreserve.org/warc/1.0/revisit/server-not-modified +Content-Length: 0 + + +WARC/1.0 +WARC-Type: continuation +WARC-Segment-Number: 1 +Content-Length: 0 + + diff --git a/test/test_tests.py b/test/test_tests.py new file mode 100644 index 00000000..239d2461 --- /dev/null +++ b/test/test_tests.py @@ -0,0 +1,149 @@ +from warcio.cli import main + +from . 
import get_test_file +from .test_cli import patch_stdout + + +def helper(args, expected_exit_value): + with patch_stdout() as buff: + exit_value = None + try: + main(args=args) + except SystemExit as e: + exit_value = e.code + finally: + assert exit_value == expected_exit_value + + return buff.getvalue() + + +def remove_before_test_data(s): + ret = b'' + for line in s.splitlines(True): + if b'/test/data/' in line: + line = b'test/data/' + line.split(b'/test/data/', 1)[1] + ret += line + return ret + + +def test_torture_missing(): + files = ['standard-torture-missing.warc'] + files = [get_test_file(filename) for filename in files] + + args = ['test'] + args.extend(files) + + expected = b"""\ +test/data/standard-torture-missing.warc + WARC-Record-ID None + WARC-Type warcinfo + digest not present + error: missing required header Content-Type + error: missing required header WARC-Date + error: missing required header WARC-Record-ID + recommendation: warcinfo Content-Type of application/warc-fields, saw none +""" + + value = helper(args, 0) + assert remove_before_test_data(value) == expected + + +def test_torture_validate_record(): + files = ['standard-torture-validate-record.warc'] + files = [get_test_file(filename) for filename in files] + + args = ['test'] + args.extend(files) + + expected = b"""\ +test/data/standard-torture-validate-record.warc + WARC-Record-ID None + WARC-Type warcinfo + digest not present + error: missing required header WARC-Date + error: missing required header WARC-Record-ID + error: warc-fields contains invalid utf-8: 'utf-8' codec can't decode byte 0xc3 in position 57: invalid continuation byte + comment: The first line of warc-fields cannot start with whitespace + comment: warc-fields lines must end with \\r\\n: test: lines should end with \\r\\n + comment: Missing field-name : in warc-fields line: no colon + comment: invalid warc-fields name: token cannot have a space + WARC-Record-ID None + WARC-Type warcinfo + digest not present + error: 
missing required header WARC-Date + error: missing required header WARC-Record-ID + comment: warc-fields body present but empty + WARC-Record-ID None + WARC-Type response + digest not present + error: missing required header WARC-Date + error: missing required header WARC-Record-ID + error: responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw text/plain + error: WARC-IP-Address should be used for http and https responses + WARC-Record-ID None + WARC-Type resource + digest not present + error: missing required header WARC-Date + error: missing required header WARC-Record-ID + WARC-Record-ID None + WARC-Type resource + digest not present + error: missing required header WARC-Date + error: missing required header WARC-Record-ID + WARC-Record-ID None + WARC-Type request + digest not present + error: missing required header WARC-Date + error: missing required header WARC-Record-ID + error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw text/plain + error: WARC-IP-Address should be used for http and https requests + WARC-Record-ID None + WARC-Type metadata + digest not present + error: missing required header WARC-Date + error: missing required header WARC-Record-ID + comment: warc-fields body present but empty + WARC-Record-ID None + WARC-Type revisit + digest not present + error: missing required header Content-Type + error: missing required header WARC-Date + error: missing required header WARC-Record-ID + error: missing required header WARC-Target-URI + comment: extension seen warc-profile none + comment: no revisit details validation done due to unknown profile + WARC-Record-ID None + WARC-Type revisit + digest not present + error: missing required header Content-Type + error: missing required header WARC-Date + error: missing required header WARC-Record-ID + error: missing required header WARC-Target-URI + error: missing required header 
WARC-Payload-Digest + recommendation: missing recommended header WARC-Refers-To + recommendation: missing recommended header WARC-Refers-To-Date + recommendation: missing recommended header WARC-Refers-To-Target-URI + WARC-Record-ID None + WARC-Type revisit + digest not present + error: missing required header Content-Type + error: missing required header WARC-Date + error: missing required header WARC-Record-ID + error: missing required header WARC-Target-URI + recommendation: missing recommended header WARC-Refers-To + recommendation: missing recommended header WARC-Refers-To-Date + comment: extension seen warc-profile http://netpreserve.org/warc/1.0/revisit/server-not-modified + WARC-Record-ID None + WARC-Type continuation + digest not present + error: missing required header WARC-Date + error: missing required header WARC-Record-ID + error: missing required header WARC-Segment-Origin-ID + error: missing required header WARC-Target-URI + error: continuation record must have WARC-Segment-Number > 1, saw 1 + comment: warcio test continuation code has not been tested, expect bugs +""" + + value = helper(args, 0) + print(remove_before_test_data(value).decode()) + assert remove_before_test_data(value) == expected diff --git a/warcio/tester.py b/warcio/tester.py index 386586bb..bdfe38f0 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -75,10 +75,10 @@ def validate_warc_fields(record, commentary): # field-value = *( field-content | LWS ) # LWS signals continuations # field-name = token # token_re - content = record.content + content = record.content # TESTME try: text = to_native_str(content, 'utf-8', errors='strict') - except UnicodeDecodeError as e: + except UnicodeDecodeError as e: # TESTME commentary.error('warc-fields contains invalid utf-8: '+str(e)) text = to_native_str(content, 'utf-8', errors='replace') @@ -86,14 +86,14 @@ def validate_warc_fields(record, commentary): lines = [] for line in text.splitlines(True): if not line.endswith('\r\n'): - 
commentary.error('warc-fields lines must end with \r\n') + commentary.comment('warc-fields lines must end with \\r\\n:', line.rstrip()) line = line.rstrip('\r\n') else: line = line[:-2] if line.startswith(' ') or line.startswith('\t'): if first_line: - commentary.error('The first line of warc-fields cannot start with whitespace') + commentary.comment('The first line of warc-fields cannot start with whitespace') else: lines[-1] += ' ' + line[1:] elif line == '': @@ -102,22 +102,26 @@ def validate_warc_fields(record, commentary): else: # check for field-name : if ':' not in line: - commentary.error('Missing field-name : in warc-fields line', line) + commentary.comment('Missing field-name : in warc-fields line:', line) else: field_name = line.split(':', 1)[0] if not re.fullmatch(token_re, field_name): - commentary('invalid warc-fields name', field_name) + commentary.comment('invalid warc-fields name:', field_name) else: lines.append(line) first_line = False + if not lines: + commentary.comment('warc-fields body present but empty') + return + # check known fields def validate_warcinfo(record, commentary, pending): - content_type = record.rec_headers.get_header('Content-Type') + content_type = record.rec_headers.get_header('Content-Type', 'none') if content_type.lower() != 'application/warc-fields': - commentary.recommencation('warcinfo Content-Type of application/warc-fields, saw', content_type) + commentary.recommendation('warcinfo Content-Type of application/warc-fields, saw', content_type) else: # format: warc-fields # allowable fields include but not limited to DMCI plus the following @@ -133,25 +137,27 @@ def validate_warcinfo(record, commentary, pending): def validate_response(record, commentary, pending): - target_uri = record.rec_headers.get_header('WARC-Target-URI').lower() + target_uri = record.rec_headers.get_header('WARC-Target-URI', 'none').lower() # TESTME if target_uri.startswith('http:') or target_uri.startswith('https:'): - content_type = 
record.rec_headers.get_header('Content-Type') + content_type = record.rec_headers.get_header('Content-Type', 'none') if canon_content_type(content_type) not in {'application/http;msgtype=response', 'application/http'}: - commentary.error('responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw ', content_type) + commentary.error('responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw', content_type) if record.rec_headers.get_header('WARC-IP-Address') is None: commentary.error('WARC-IP-Address should be used for http and https responses') # error: http and https schemes should have http response headers + # test by attempting to parse them? + # comment: verify http content-length, if present -- commoncrawl nutch bug def validate_resource(record, commentary, pending): - target_uri = record.rec_headers.get_header('WARC-Target-URI').lower() + target_uri = record.rec_headers.get_header('WARC-Target-URI', '').lower() # TESTME if target_uri.startswith('dns:'): - content_type = record.rec_headers.get_header('Content-Type') + content_type = record.rec_headers.get_header('Content-Type', 'none') if content_type.lower() != 'text/dns': commentary.error('recource records for dns: shall have Content-Type of text/dns, saw', content_type) else: @@ -163,13 +169,13 @@ def validate_resource(record, commentary, pending): def validate_request(record, commentary, pending): - target_uri = record.rec_headers.get_header('WARC-Target-URI').lower() + target_uri = record.rec_headers.get_header('WARC-Target-URI', 'none').lower() # TESTME if target_uri.startswith('http:') or target_uri.startswith('https:'): content_type = record.rec_headers.get_header('Content-Type') if canon_content_type(content_type) not in {'application/http;msgtype=request', 'application/http'}: - commentary.error('requests for http/https should have Content-Type of application/http; msgtype=request or 
application/http, saw ', content_type) + commentary.error('requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw', content_type) if record.rec_headers.get_header('WARC-IP-Address') is None: commentary.error('WARC-IP-Address should be used for http and https requests') @@ -180,7 +186,7 @@ def validate_request(record, commentary, pending): def validate_metadata(record, commentary, pending): - content_type = record.rec_headers.get_header('Content-Type') + content_type = record.rec_headers.get_header('Content-Type', 'none') # TESTME if content_type.lower() == 'application/warc-fields': # dublin core plus via, hopsFromSeed, fetchTimeMs -- w1.1 section 6 # via: uri -- example in Warc 1.1 section 10.5 does not have <> around it @@ -190,7 +196,7 @@ def validate_metadata(record, commentary, pending): def validate_revisit(record, commentary, pending): - warc_profile = record.rec_headers.get_header('WARC-Profile') + warc_profile = record.rec_headers.get_header('WARC-Profile', 'none') # TESTME if warc_profile.endswith('/revisit/identical-payload-digest') or warc_profile.endswith('/revisit/uri-agnostic-identical-payload-digest'): config = { @@ -201,7 +207,7 @@ def validate_revisit(record, commentary, pending): # may have record block; if not, shall have Content-Length: 0, if yes, should be like a response record, truncated FOR LENGTH ONLY if desired # recommended that server response headers be preserved "in this manner" - elif warc_profile.ends_with('/revisit/server-not-modified'): + elif warc_profile.endswith('/revisit/server-not-modified'): config = { 'recommended': ['WARC-Refers-To', 'WARC-Refers-To-Date'], 'prohibited': ['WARC-Payload-Digest'], @@ -216,15 +222,15 @@ def validate_revisit(record, commentary, pending): def validate_conversion(record, commentary, pending): # where practical, have a warc-refers-to field -- not quite a recommendation, perhaps make it a comment? 
# suggests there should be a corresponding metadata record -- which may have a WARC-Refers-To - pass + pass # TESTME def validate_continuation(record, commentary, pending): - commentary.comment('warcio test continuation code has not been tested, expect bugs') + commentary.comment('warcio test continuation code has not been tested, expect bugs') # TESTME - warc_type = record.rec_headers.get_header('WARC-Type') - if warc_type in {'warcinfo', 'request', 'metadata', 'revisit'}: - commentary.recommendation('do not segment warc-type', warc_type) + segment_number = record.rec_headers.get_header('WARC-Segment-Number', 'none') + if segment_number.isdigit() and int(segment_number) < 2: + commentary.error('continuation record must have WARC-Segment-Number > 1, saw', segment_number) # last segment: required WARC-Segment-Total-Length, optional WARC-Truncated @@ -234,7 +240,7 @@ def validate_actual_uri(field, value, record, version, commentary, pending): # should use a registered scheme # %XX encoding, normalize to upper case # schemes are case-insensitive and normalize to lower - if value.startswith('<') or value.endswith('>'): + if value.startswith('<') or value.endswith('>'): # TESTME # wget 1.19 bug caused by WARC 1.0 spec error commentary.error('uri must not be within <>', field, value) if ':' not in value: @@ -250,10 +256,10 @@ def validate_actual_uri(field, value, record, version, commentary, pending): def validate_warc_type(field, value, record, version, commentary, pending): if not value.islower(): # I am unclear if this is allowed? 
standard is silent - commentary.comment('Warc-Type is not lower-case', field, value) + commentary.comment('WARC-Type is not lower-case', field, value) if value.lower() not in record_types: # standard says readers should ignore unknown warc-types - commentary.comment('unknown Warc-Type', field, value) + commentary.comment('unknown WARC-Type', field, value) def validate_uri(field, value, record, version, commentary, pending): @@ -307,8 +313,10 @@ def validate_content_type(field, value, record, version, commentary, pending): subtype = rest if not re.fullmatch(token_re, subtype, re.A): commentary.error('invalid subtype', field, value) + # at this point there can be multiple parameters, # some of which could have quoted string values with ; in them + # TODO: more checking @@ -372,11 +380,17 @@ def validate_profile(field, value, record, version, commentary, pending): def validate_segment_number(field, value, record, version, commentary, pending): if not value.isdigit(): commentary.error('must be an integer', field, value) + return iv = int(value) if iv == 0: commentary.error('must be 1 or greater', field, value) - # TODO: type != continuation must have iv == 1, else iv > 1 - # might make that check in the 'continuation' section? 
+ + rec_type = record.rec_headers.get_header('WARC-Type', 'none') + if rec_type != 'continuation': + if iv != 1: + commentary.error('non-continuation records must always have WARC-Segment-Number = 1', field, value) + elif rec_type in {'warcinfo', 'request', 'metadata', 'revisit'}: + commentary.recommendation('do not segment warc-type', warc_type) def validate_segment_total_length(field, value, record, version, commentary, pending): @@ -507,7 +521,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen 'continuation': { 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'WARC-Segment-Origin-ID', 'WARC-Segment-Number', 'WARC-Target-URI'], - 'optional': [], + 'optional': ['WARC-Segment-Total-Length', 'WARC-Truncated'], 'prohibited': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], 'validate': validate_continuation, }, @@ -522,10 +536,10 @@ def make_header_set(config, kinds): def validate_fields_against_rec_type(rec_type, config, rec_headers, commentary, allow_all=False): - for req in config.get('required', []): + for req in sorted(config.get('required', [])): if not rec_headers.get_header(req): commentary.error('missing required header', req) - for rec in config.get('recommended', []): + for rec in sorted(config.get('recommended', [])): if not rec_headers.get_header(rec): commentary.recommendation('missing recommended header', rec) allowed = make_header_set(config, ('required', 'optional', 'recommended')) @@ -554,9 +568,6 @@ def validate_record(record): record_id = record.rec_headers.get_header('WARC-Record-ID') rec_type = record.rec_headers.get_header('WARC-Type') - if record_id is None: - print('no WARC-Record-ID seen, skipping validation', file=sys.stderr) - return commentary = Commentary(record_id, rec_type) pending = None From c70e68eee640a86a9711223d8350a00f6301445a Mon 
Sep 17 00:00:00 2001 From: Greg Lindahl Date: Sat, 26 Jan 2019 08:46:10 -0800 Subject: [PATCH 06/68] python 2.7 test fix --- warcio/tester.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/warcio/tester.py b/warcio/tester.py index bdfe38f0..b74a3b03 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -105,7 +105,7 @@ def validate_warc_fields(record, commentary): commentary.comment('Missing field-name : in warc-fields line:', line) else: field_name = line.split(':', 1)[0] - if not re.fullmatch(token_re, field_name): + if not re.search(token_re, field_name): commentary.comment('invalid warc-fields name:', field_name) else: lines.append(line) @@ -248,7 +248,7 @@ def validate_actual_uri(field, value, record, version, commentary, pending): if re.search(r'\s', value, re.A): commentary.error('invalid uri, contains whitespace', field, value) scheme, rest = value.split(':', 1) - if not re.fullmatch(r'[A-Za-z][A-Za-z0-9+\-\.]*', scheme, re.A): + if not re.search(r'\A[A-Za-z][A-Za-z0-9+\-\.]*\Z', scheme, re.A): commentary.error('invalid uri scheme, bad character', field, value) # https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml @@ -283,7 +283,7 @@ def validate_timestamp(field, value, record, version, commentary, pending): commentary.error('WARC 1.0 may not have fractional seconds', field, value) else: start, end = value.split('.', 1) - if not re.fullmatch(r'[0-9]{1,9}Z', end, re.A): + if not re.search(r'\A[0-9]{1,9}Z\Z', end, re.A): commentary.error('fractional seconds must have 1-9 digits', field, value) # XXX the above is pretty incomplete for dash, colon, trailing Z, etc @@ -297,21 +297,21 @@ def validate_content_length(field, value, record, version, commentary, pending): commentary.error('must be an integer', field, value) -token_re = r'[!"#$%&\'()*+\-\.0-9A-Z\^_`a-z|~]+' -digest_re = r'[A-Za-z0-9/+\-_=]+' +token_re = r'\A[!"#$%&\'()*+\-\.0-9A-Z\^_`a-z|~]+\Z' +digest_re = r'\A[A-Za-z0-9/+\-_=]+\Z' def 
validate_content_type(field, value, record, version, commentary, pending): if '/' not in value: commentary.error('must contain a /', field, value) ctype, rest = value.split('/', 1) - if not re.fullmatch(token_re, ctype, re.A): + if not re.search(token_re, ctype, re.A): commentary.error('invalid type', field, value) if ';' in rest: subtype, rest = rest.split(';', 1) else: subtype = rest - if not re.fullmatch(token_re, subtype, re.A): + if not re.search(token_re, subtype, re.A): commentary.error('invalid subtype', field, value) # at this point there can be multiple parameters, @@ -324,13 +324,13 @@ def validate_digest(field, value, record, version, commentary, pending): if ':' not in value: commentary.error('missing algorithm', field, value) algorithm, digest = value.split(':', 1) - if not re.fullmatch(token_re, algorithm, re.A): + if not re.search(token_re, algorithm, re.A): commentary.error('invalid algorithm', field, value) - if not re.fullmatch(token_re, digest, re.A): + if not re.search(token_re, digest, re.A): # https://github.com/iipc/warc-specifications/issues/48 # commentary.comment('spec incorrectly says this is an invalid digest', field, value) pass - if not re.fullmatch(digest_re, digest, re.A): + if not re.search(digest_re, digest, re.A): commentary.comment('Invalid-looking digest value', field, value) From 1847633cae2e221940e314c28036a0c4bdb4323b Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Sat, 26 Jan 2019 08:51:08 -0800 Subject: [PATCH 07/68] python 2.7 fixes --- warcio/tester.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/warcio/tester.py b/warcio/tester.py index b74a3b03..c978a404 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -245,10 +245,10 @@ def validate_actual_uri(field, value, record, version, commentary, pending): commentary.error('uri must not be within <>', field, value) if ':' not in value: commentary.error('invalid uri, no scheme', field, value) - if re.search(r'\s', value, re.A): + if 
re.search(r'\s', value): commentary.error('invalid uri, contains whitespace', field, value) scheme, rest = value.split(':', 1) - if not re.search(r'\A[A-Za-z][A-Za-z0-9+\-\.]*\Z', scheme, re.A): + if not re.search(r'\A[A-Za-z][A-Za-z0-9+\-\.]*\Z', scheme): commentary.error('invalid uri scheme, bad character', field, value) # https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml @@ -283,7 +283,7 @@ def validate_timestamp(field, value, record, version, commentary, pending): commentary.error('WARC 1.0 may not have fractional seconds', field, value) else: start, end = value.split('.', 1) - if not re.search(r'\A[0-9]{1,9}Z\Z', end, re.A): + if not re.search(r'\A[0-9]{1,9}Z\Z', end): commentary.error('fractional seconds must have 1-9 digits', field, value) # XXX the above is pretty incomplete for dash, colon, trailing Z, etc @@ -305,13 +305,13 @@ def validate_content_type(field, value, record, version, commentary, pending): if '/' not in value: commentary.error('must contain a /', field, value) ctype, rest = value.split('/', 1) - if not re.search(token_re, ctype, re.A): + if not re.search(token_re, ctype): commentary.error('invalid type', field, value) if ';' in rest: subtype, rest = rest.split(';', 1) else: subtype = rest - if not re.search(token_re, subtype, re.A): + if not re.search(token_re, subtype): commentary.error('invalid subtype', field, value) # at this point there can be multiple parameters, @@ -324,13 +324,13 @@ def validate_digest(field, value, record, version, commentary, pending): if ':' not in value: commentary.error('missing algorithm', field, value) algorithm, digest = value.split(':', 1) - if not re.search(token_re, algorithm, re.A): + if not re.search(token_re, algorithm): commentary.error('invalid algorithm', field, value) - if not re.search(token_re, digest, re.A): + if not re.search(token_re, digest): # https://github.com/iipc/warc-specifications/issues/48 # commentary.comment('spec incorrectly says this is an invalid digest', field, 
value) pass - if not re.search(digest_re, digest, re.A): + if not re.search(digest_re, digest): commentary.comment('Invalid-looking digest value', field, value) From 2c676db25daf0abe842ddafd5fa058360ae218f0 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Sat, 26 Jan 2019 11:08:05 -0800 Subject: [PATCH 08/68] coverage --- .../data/standard-torture-validate-field.warc | 52 ++++++++ .../standard-torture-validate-record.warc | 5 + test/test_tests.py | 123 +++++++++++++++++- warcio/tester.py | 73 +++++++---- 4 files changed, 219 insertions(+), 34 deletions(-) create mode 100644 test/data/standard-torture-validate-field.warc diff --git a/test/data/standard-torture-validate-field.warc b/test/data/standard-torture-validate-field.warc new file mode 100644 index 00000000..2c28d72d --- /dev/null +++ b/test/data/standard-torture-validate-field.warc @@ -0,0 +1,52 @@ +WARC/1.0 +WARC-Target-URI: +WARC-Target-URI: example.com +WARC-Target-URI: ex ample.com +WARC-Target-URI: h<>ttp://example.com/ +WARC-Type: does-not-exist +WARC-Type: CAPITALIZED +WARC-Concurrent-To: http://example.com/ +WARC-Record-ID: +WARC-Date: 2017-03-06T04:03:53Z +WARC-Date: 2017-03-06T04:03:53.Z +Content-Type: asdf +Content-Type: has space/asdf +Content-Type: asdf/has space +Content-Type: asdf/has space;asdf +WARC-Block-Digest: asdf +WARC-Block-Digest: has space:asdf +WARC-Block-Digest: sha1:&$*^&*^#*&^ +WARC-IP-Address: 1.2.3.4.5 +WARC-Truncated: invalid +WARC-Warcinfo-ID: asdf:asdf +WARC-Filename: not-yet-tested +WARC-Profile: asdf +WARC-Profile: http://netpreserve.org/warc/1.0/revisit/identical-payload-digest +WARC-Identified-Payload-Type: asdf +WARC-Segment-Origin-ID: http://example.com +WARC-Segment-Number: not-an-integer +WARC-Segment-Number: 0 +WARC-Segment-Number: 1 +WARC-Segment-Number: 2 +WARC-Segment-Total-Length: 0 +WARC-Segment-Total-Length: not-an-integer +WARC-Refers-To-Target-URI: http://example.com +WARC-Refers-To-Date: not-a-date +WARC-Unknown-Field: asdf +Content-Length: 0 + + +WARC/1.1 
+WARC-Date: 2017-03-06T04:03:53Z +WARC-Date: 2017-03-06T04:03:53.Z +WARC-Type: invalid +Content-Length: 0 + + +WARC/1.1 +WARC-Type: request +WARC-Segment-Number: 1 +Content-Length: 0 + + +WARC/invalid diff --git a/test/data/standard-torture-validate-record.warc b/test/data/standard-torture-validate-record.warc index 5181ea38..d212f370 100644 --- a/test/data/standard-torture-validate-record.warc +++ b/test/data/standard-torture-validate-record.warc @@ -71,6 +71,11 @@ WARC-Profile: http://netpreserve.org/warc/1.0/revisit/server-not-modified Content-Length: 0 +WARC/1.0 +WARC-Type: conversion +Content-Length: 0 + + WARC/1.0 WARC-Type: continuation WARC-Segment-Number: 1 diff --git a/test/test_tests.py b/test/test_tests.py index 239d2461..19b7e377 100644 --- a/test/test_tests.py +++ b/test/test_tests.py @@ -2,6 +2,7 @@ from . import get_test_file from .test_cli import patch_stdout +from warcio.utils import to_native_str def helper(args, expected_exit_value): @@ -14,14 +15,14 @@ def helper(args, expected_exit_value): finally: assert exit_value == expected_exit_value - return buff.getvalue() + return to_native_str(buff.getvalue()) def remove_before_test_data(s): - ret = b'' + ret = '' for line in s.splitlines(True): - if b'/test/data/' in line: - line = b'test/data/' + line.split(b'/test/data/', 1)[1] + if '/test/data/' in line: + line = 'test/data/' + line.split('/test/data/', 1)[1] ret += line return ret @@ -33,7 +34,7 @@ def test_torture_missing(): args = ['test'] args.extend(files) - expected = b"""\ + expected = """\ test/data/standard-torture-missing.warc WARC-Record-ID None WARC-Type warcinfo @@ -55,7 +56,7 @@ def test_torture_validate_record(): args = ['test'] args.extend(files) - expected = b"""\ + expected = """\ test/data/standard-torture-validate-record.warc WARC-Record-ID None WARC-Type warcinfo @@ -85,6 +86,7 @@ def test_torture_validate_record(): digest not present error: missing required header WARC-Date error: missing required header WARC-Record-ID + 
error: recource records for dns: shall have Content-Type of text/dns, saw text/plain WARC-Record-ID None WARC-Type resource digest not present @@ -133,6 +135,12 @@ def test_torture_validate_record(): recommendation: missing recommended header WARC-Refers-To recommendation: missing recommended header WARC-Refers-To-Date comment: extension seen warc-profile http://netpreserve.org/warc/1.0/revisit/server-not-modified + WARC-Record-ID None + WARC-Type conversion + digest not present + error: missing required header WARC-Date + error: missing required header WARC-Record-ID + error: missing required header WARC-Target-URI WARC-Record-ID None WARC-Type continuation digest not present @@ -145,5 +153,106 @@ def test_torture_validate_record(): """ value = helper(args, 0) - print(remove_before_test_data(value).decode()) + print(remove_before_test_data(value)) + assert remove_before_test_data(value) == expected + + +def test_torture_validate_field(): + files = ['standard-torture-validate-field.warc'] + files = [get_test_file(filename) for filename in files] + + args = ['test'] + args.extend(files) + + expected = """\ +test/data/standard-torture-validate-field.warc + WARC-Record-ID + WARC-Type does-not-exist + unknown hash algorithm name in block digest + error: uri must not be within <> warc-target-uri + error: invalid uri scheme, bad character warc-target-uri + error: duplicate field seen warc-target-uri example.com + error: invalid uri, no scheme warc-target-uri example.com + error: duplicate field seen warc-target-uri ex ample.com + error: invalid uri, no scheme warc-target-uri ex ample.com + error: invalid uri, contains whitespace warc-target-uri ex ample.com + error: invalid uri scheme, bad character warc-target-uri ex ample.com + error: duplicate field seen warc-target-uri h<>ttp://example.com/ + error: invalid uri scheme, bad character warc-target-uri h<>ttp://example.com/ + error: duplicate field seen warc-type CAPITALIZED + error: uri must be within <> 
warc-concurrent-to http://example.com/ + error: duplicate field seen warc-date 2017-03-06T04:03:53.Z + error: WARC 1.0 may not have fractional seconds warc-date 2017-03-06T04:03:53.Z + error: must contain a / content-type asdf + error: invalid subtype content-type asdf + error: duplicate field seen content-type has space/asdf + error: invalid type content-type has space/asdf + error: duplicate field seen content-type asdf/has space + error: invalid subtype content-type asdf/has space + error: duplicate field seen content-type asdf/has space;asdf + error: invalid subtype content-type asdf/has space;asdf + error: missing algorithm warc-block-digest asdf + error: duplicate field seen warc-block-digest has space:asdf + error: invalid algorithm warc-block-digest has space:asdf + error: duplicate field seen warc-block-digest sha1:&$*^&*^#*&^ + error: uri must be within <> warc-warcinfo-id asdf:asdf + error: duplicate field seen warc-profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest + error: must contain a / warc-identified-payload-type asdf + error: invalid subtype warc-identified-payload-type asdf + error: uri must be within <> warc-segment-origin-id http://example.com + error: must be an integer warc-segment-number not-an-integer + error: duplicate field seen warc-segment-number 0 + error: must be 1 or greater warc-segment-number 0 + error: non-continuation records must always have WARC-Segment-Number = 1 warc-segment-number 0 + error: duplicate field seen warc-segment-number 1 + error: duplicate field seen warc-segment-number 2 + error: non-continuation records must always have WARC-Segment-Number = 1 warc-segment-number 2 + error: duplicate field seen warc-segment-total-length not-an-integer + error: must be an integer warc-segment-total-length not-an-integer + comment: unknown WARC-Type warc-type does-not-exist + comment: WARC-Type is not lower-case warc-type CAPITALIZED + comment: unknown WARC-Type warc-type CAPITALIZED + comment: unknown 
digest algorithm warc-block-digest asdf + comment: Invalid-looking digest value warc-block-digest sha1:&$*^&*^#*&^ + comment: did not check ip address format, install ipaddress module from pypi if you care + comment: extension seen warc-truncated invalid + comment: extension seen warc-profile asdf + comment: extension seen warc-profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest + comment: field was introduced after this warc version WARC-Refers-To-Target-URI http://example.com 1.0 + comment: field was introduced after this warc version WARC-Refers-To-Date not-a-date 1.0 + comment: unknown field, no validation performed WARC-Unknown-Field asdf + WARC-Record-ID None + WARC-Type invalid + digest not present + error: duplicate field seen warc-date 2017-03-06T04:03:53.Z + error: fractional seconds must have 1-9 digits warc-date 2017-03-06T04:03:53.Z + comment: unknown WARC-Type warc-type invalid + WARC-Record-ID None + WARC-Type request + digest not present + error: missing required header Content-Type + error: missing required header WARC-Date + error: missing required header WARC-Record-ID + error: missing required header WARC-Target-URI + recommendation: do not segment WARC-Type request + comment: no configuration seen for WARC-Segment-Number request +""" + + value = helper(args, 0) + print(remove_before_test_data(value)) + assert remove_before_test_data(value) == expected + + +def test_arc(): + files = ['does-not-exist.arc'] + files = [get_test_file(filename) for filename in files] + + args = ['test'] + args.extend(files) + + expected = """\ +test/data/does-not-exist.arc +""" + + value = helper(args, 0) assert remove_before_test_data(value) == expected diff --git a/warcio/tester.py b/warcio/tester.py index c978a404..4c2f8299 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -5,15 +5,15 @@ import traceback from warcio.archiveiterator import WARCIterator -from warcio.utils import to_native_str +from warcio.utils import to_native_str, 
Digester -def try_ipaddress_init(): +def try_ipaddress_import(): # ipaddress is in 3.3+ but not 2.7. It is in pypi but we wish to limit dependencies. try: import ipaddress except ImportError: # pragma: no cover - pass + print('ipaddress module not imported') class Commentary: @@ -75,10 +75,10 @@ def validate_warc_fields(record, commentary): # field-value = *( field-content | LWS ) # LWS signals continuations # field-name = token # token_re - content = record.content # TESTME + content = record.content try: text = to_native_str(content, 'utf-8', errors='strict') - except UnicodeDecodeError as e: # TESTME + except UnicodeDecodeError as e: commentary.error('warc-fields contains invalid utf-8: '+str(e)) text = to_native_str(content, 'utf-8', errors='replace') @@ -137,7 +137,7 @@ def validate_warcinfo(record, commentary, pending): def validate_response(record, commentary, pending): - target_uri = record.rec_headers.get_header('WARC-Target-URI', 'none').lower() # TESTME + target_uri = record.rec_headers.get_header('WARC-Target-URI', 'none').lower() if target_uri.startswith('http:') or target_uri.startswith('https:'): content_type = record.rec_headers.get_header('Content-Type', 'none') @@ -154,7 +154,7 @@ def validate_response(record, commentary, pending): def validate_resource(record, commentary, pending): - target_uri = record.rec_headers.get_header('WARC-Target-URI', '').lower() # TESTME + target_uri = record.rec_headers.get_header('WARC-Target-URI', '').lower() if target_uri.startswith('dns:'): content_type = record.rec_headers.get_header('Content-Type', 'none') @@ -169,7 +169,7 @@ def validate_resource(record, commentary, pending): def validate_request(record, commentary, pending): - target_uri = record.rec_headers.get_header('WARC-Target-URI', 'none').lower() # TESTME + target_uri = record.rec_headers.get_header('WARC-Target-URI', 'none').lower() if target_uri.startswith('http:') or target_uri.startswith('https:'): content_type = 
record.rec_headers.get_header('Content-Type') @@ -186,7 +186,7 @@ def validate_request(record, commentary, pending): def validate_metadata(record, commentary, pending): - content_type = record.rec_headers.get_header('Content-Type', 'none') # TESTME + content_type = record.rec_headers.get_header('Content-Type', 'none') if content_type.lower() == 'application/warc-fields': # dublin core plus via, hopsFromSeed, fetchTimeMs -- w1.1 section 6 # via: uri -- example in Warc 1.1 section 10.5 does not have <> around it @@ -196,7 +196,7 @@ def validate_metadata(record, commentary, pending): def validate_revisit(record, commentary, pending): - warc_profile = record.rec_headers.get_header('WARC-Profile', 'none') # TESTME + warc_profile = record.rec_headers.get_header('WARC-Profile', 'none') if warc_profile.endswith('/revisit/identical-payload-digest') or warc_profile.endswith('/revisit/uri-agnostic-identical-payload-digest'): config = { @@ -222,11 +222,11 @@ def validate_revisit(record, commentary, pending): def validate_conversion(record, commentary, pending): # where practical, have a warc-refers-to field -- not quite a recommendation, perhaps make it a comment? 
# suggests there should be a corresponding metadata record -- which may have a WARC-Refers-To - pass # TESTME + pass def validate_continuation(record, commentary, pending): - commentary.comment('warcio test continuation code has not been tested, expect bugs') # TESTME + commentary.comment('warcio test continuation code has not been tested, expect bugs') segment_number = record.rec_headers.get_header('WARC-Segment-Number', 'none') if segment_number.isdigit() and int(segment_number) < 2: @@ -240,14 +240,14 @@ def validate_actual_uri(field, value, record, version, commentary, pending): # should use a registered scheme # %XX encoding, normalize to upper case # schemes are case-insensitive and normalize to lower - if value.startswith('<') or value.endswith('>'): # TESTME + if value.startswith('<') or value.endswith('>'): # wget 1.19 bug caused by WARC 1.0 spec error commentary.error('uri must not be within <>', field, value) if ':' not in value: commentary.error('invalid uri, no scheme', field, value) if re.search(r'\s', value): commentary.error('invalid uri, contains whitespace', field, value) - scheme, rest = value.split(':', 1) + scheme = value.split(':', 1)[0] if not re.search(r'\A[A-Za-z][A-Za-z0-9+\-\.]*\Z', scheme): commentary.error('invalid uri scheme, bad character', field, value) # https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml @@ -282,9 +282,10 @@ def validate_timestamp(field, value, record, version, commentary, pending): # XXX specification infelicity: would be nice to have 'advice to implementers' here commentary.error('WARC 1.0 may not have fractional seconds', field, value) else: - start, end = value.split('.', 1) - if not re.search(r'\A[0-9]{1,9}Z\Z', end): - commentary.error('fractional seconds must have 1-9 digits', field, value) + if '.' 
in value: + start, end = value.split('.', 1) + if not re.search(r'\A[0-9]{1,9}Z\Z', end): + commentary.error('fractional seconds must have 1-9 digits', field, value) # XXX the above is pretty incomplete for dash, colon, trailing Z, etc @@ -304,7 +305,12 @@ def validate_content_length(field, value, record, version, commentary, pending): def validate_content_type(field, value, record, version, commentary, pending): if '/' not in value: commentary.error('must contain a /', field, value) - ctype, rest = value.split('/', 1) + splits = value.split('/', 1) + ctype = splits[0] + if len(splits) > 1: + rest = splits[1] + else: + rest = '' if not re.search(token_re, ctype): commentary.error('invalid type', field, value) if ';' in rest: @@ -323,9 +329,19 @@ def validate_content_type(field, value, record, version, commentary, pending): def validate_digest(field, value, record, version, commentary, pending): if ':' not in value: commentary.error('missing algorithm', field, value) - algorithm, digest = value.split(':', 1) + splits = value.split(':', 1) + algorithm = splits[0] + if len(splits) > 1: + digest = splits[1] + else: + digest = 'none' if not re.search(token_re, algorithm): commentary.error('invalid algorithm', field, value) + else: + try: + Digester(algorithm) + except ValueError: + commentary.comment('unknown digest algorithm', field, value) if not re.search(token_re, digest): # https://github.com/iipc/warc-specifications/issues/48 # commentary.comment('spec incorrectly says this is an invalid digest', field, value) @@ -389,8 +405,8 @@ def validate_segment_number(field, value, record, version, commentary, pending): if rec_type != 'continuation': if iv != 1: commentary.error('non-continuation records must always have WARC-Segment-Number = 1', field, value) - elif rec_type in {'warcinfo', 'request', 'metadata', 'revisit'}: - commentary.recommendation('do not segment warc-type', warc_type) + if rec_type in {'warcinfo', 'request', 'metadata', 'revisit'}: + 
commentary.recommendation('do not segment WARC-Type', rec_type) def validate_segment_total_length(field, value, record, version, commentary, pending): @@ -418,7 +434,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen 'validate': validate_uri, }, 'WARC-Block-Digest': { - 'validate': validate_digest, # openssl check? or just let check_digest get it? + 'validate': validate_digest, }, 'WARC-Payload-Digest': { 'validate': validate_digest, @@ -487,6 +503,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated', 'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-Identified-Payload-Type'], 'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], + 'validate': validate_resource, }, 'request': { 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', @@ -577,6 +594,7 @@ def validate_record(record): field = field.lower() if field != 'warc-concurrent-to' and field in seen_fields: commentary.error('duplicate field seen', field, value) + seen_fields.add(field) if field not in warc_fields: commentary.comment('unknown field, no validation performed', field_case, value) continue @@ -588,9 +606,8 @@ def validate_record(record): if 'validate' in config: config['validate'](field, value, record, version, commentary, pending) - # TODO: validate warc types: unknown should get a comment if rec_type not in record_types: - commentary.comment('unknown record type, no validation performed', rec_type) + pass # we print a comment for this elsewhere else: validate_fields_against_rec_type(rec_type, record_types[rec_type], record.rec_headers, commentary) validate_record_against_rec_type(record_types[rec_type], record, commentary, pending) @@ -614,7 +631,7 @@ def _process_one(warc): record.content # make sure digests are checked # XXX might need to read and digest the 
raw stream to check digests for chunked encoding? # XXX chunked lacks Content-Length and presumably the digest needs to be computed on the non-chunked bytes - except Exception: + except Exception: # pragma: no cover # because of the top-level try: to catch exceptions in WARCIterator, this is needed to debug our code print('Caught exception in warcio test analysis code') traceback.print_exc() @@ -643,7 +660,6 @@ class Tester(object): def __init__(self, cmd): self.inputs = cmd.inputs self.exit_value = 0 - try_ipaddress_init() def process_all(self): for warc in self.inputs: @@ -651,9 +667,12 @@ def process_all(self): try: self.process_one(warc) except Exception as e: - print(' saw exception '+str(e).rstrip(), file=sys.stderr) + print(' saw exception '+repr(e).rstrip(), file=sys.stderr) print(' skipping rest of file', file=sys.stderr) return self.exit_value def process_one(self, filename): _process_one(filename) + + +try_ipaddress_import() From 97ee457f02f5b1508577e8d9118703a7b9b814f9 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Sun, 27 Jan 2019 15:25:36 -0800 Subject: [PATCH 09/68] py2 testing --- test/test_tests.py | 23 ++++++++++++++++++++--- warcio/tester.py | 14 ++------------ 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/test/test_tests.py b/test/test_tests.py index 19b7e377..a197c3ba 100644 --- a/test/test_tests.py +++ b/test/test_tests.py @@ -1,8 +1,10 @@ +import six + from warcio.cli import main +from warcio.utils import to_native_str from . 
import get_test_file from .test_cli import patch_stdout -from warcio.utils import to_native_str def helper(args, expected_exit_value): @@ -154,7 +156,13 @@ def test_torture_validate_record(): value = helper(args, 0) print(remove_before_test_data(value)) - assert remove_before_test_data(value) == expected + + ret = remove_before_test_data(value) + + if six.PY2: + expected = expected.replace('\n error: warc-fields contains invalid utf-8: \'utf-8\' codec can\'t decode byte 0xc3 in position 57: invalid continuation byte\n', '\n') + + assert ret == expected def test_torture_validate_field(): @@ -195,6 +203,7 @@ def test_torture_validate_field(): error: duplicate field seen warc-block-digest has space:asdf error: invalid algorithm warc-block-digest has space:asdf error: duplicate field seen warc-block-digest sha1:&$*^&*^#*&^ + error: invalid ip warc-ip-address 1.2.3.4.5 error: uri must be within <> warc-warcinfo-id asdf:asdf error: duplicate field seen warc-profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest error: must contain a / warc-identified-payload-type asdf @@ -214,7 +223,6 @@ def test_torture_validate_field(): comment: unknown WARC-Type warc-type CAPITALIZED comment: unknown digest algorithm warc-block-digest asdf comment: Invalid-looking digest value warc-block-digest sha1:&$*^&*^#*&^ - comment: did not check ip address format, install ipaddress module from pypi if you care comment: extension seen warc-truncated invalid comment: extension seen warc-profile asdf comment: extension seen warc-profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest @@ -240,6 +248,15 @@ def test_torture_validate_field(): value = helper(args, 0) print(remove_before_test_data(value)) + + ret = remove_before_test_data(value) + if six.PY2: + if 'error: invalid ip warc-ip-address 1.2.3.4.5' not in ret: + # user did not install ipaddress module + expected = expected.replace('\n error: invalid ip warc-ip-address 1.2.3.4.5\n', '\n') + ret = 
ret.replace('\n comment: did not check ip address format, install ipaddress module from pypi if you care\n', '\n') + + assert remove_before_test_data(value) == expected diff --git a/warcio/tester.py b/warcio/tester.py index 4c2f8299..308f35fd 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -8,14 +8,6 @@ from warcio.utils import to_native_str, Digester -def try_ipaddress_import(): - # ipaddress is in 3.3+ but not 2.7. It is in pypi but we wish to limit dependencies. - try: - import ipaddress - except ImportError: # pragma: no cover - print('ipaddress module not imported') - - class Commentary: def __init__(self, record_id, rec_type): self._record_id = record_id @@ -353,10 +345,11 @@ def validate_digest(field, value, record, version, commentary, pending): def validate_ip(field, value, record, version, commentary, pending): # ipv4 as dotted quad, or ipv6 per section 2.2 of rfc 4291 try: + import ipaddress ipaddress.ip_address(value) except ValueError: commentary.error('invalid ip', field, value) - except NameError: + except (ImportError, NameError): commentary.comment('did not check ip address format, install ipaddress module from pypi if you care') @@ -673,6 +666,3 @@ def process_all(self): def process_one(self, filename): _process_one(filename) - - -try_ipaddress_import() From df50151df2a21b00fa30ed592e6b2536f1367f97 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Sun, 27 Jan 2019 15:35:30 -0800 Subject: [PATCH 10/68] py2 windows testing --- test/test_tests.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/test/test_tests.py b/test/test_tests.py index a197c3ba..01e72ef4 100644 --- a/test/test_tests.py +++ b/test/test_tests.py @@ -25,6 +25,8 @@ def remove_before_test_data(s): for line in s.splitlines(True): if '/test/data/' in line: line = 'test/data/' + line.split('/test/data/', 1)[1] + if '\\test\\data\\' in line: + line = 'test/data/' + line.split('\\test\\data\\', 1)[1] ret += line return ret @@ -247,17 +249,16 @@ def 
test_torture_validate_field(): """ value = helper(args, 0) - print(remove_before_test_data(value)) - ret = remove_before_test_data(value) + if six.PY2: if 'error: invalid ip warc-ip-address 1.2.3.4.5' not in ret: # user did not install ipaddress module expected = expected.replace('\n error: invalid ip warc-ip-address 1.2.3.4.5\n', '\n') ret = ret.replace('\n comment: did not check ip address format, install ipaddress module from pypi if you care\n', '\n') - - assert remove_before_test_data(value) == expected + print(ret) + assert ret == expected def test_arc(): From 858a752021640a47f716522bb5ef1fdd30d3fb82 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Sun, 27 Jan 2019 23:11:46 -0800 Subject: [PATCH 11/68] coverage --- .../standard-torture-validate-record.warc | 1 + test/test_tests.py | 55 ++++++++++++++++++- warcio/tester.py | 52 +++++++++--------- 3 files changed, 80 insertions(+), 28 deletions(-) diff --git a/test/data/standard-torture-validate-record.warc b/test/data/standard-torture-validate-record.warc index d212f370..08a39e50 100644 --- a/test/data/standard-torture-validate-record.warc +++ b/test/data/standard-torture-validate-record.warc @@ -1,6 +1,7 @@ WARC/1.0 WARC-Type: warcinfo Content-Type: application/warc-fields +WARC-Refers-To: probhibited Content-Length: 146 first line can't start with a space diff --git a/test/test_tests.py b/test/test_tests.py index 01e72ef4..0fdecc74 100644 --- a/test/test_tests.py +++ b/test/test_tests.py @@ -2,6 +2,7 @@ from warcio.cli import main from warcio.utils import to_native_str +import warcio.tester from . 
import get_test_file from .test_cli import patch_stdout @@ -65,8 +66,10 @@ def test_torture_validate_record(): WARC-Record-ID None WARC-Type warcinfo digest not present + error: uri must be within <> warc-refers-to probhibited error: missing required header WARC-Date error: missing required header WARC-Record-ID + error: field not allowed in record_type WARC-Refers-To warcinfo error: warc-fields contains invalid utf-8: 'utf-8' codec can't decode byte 0xc3 in position 57: invalid continuation byte comment: The first line of warc-fields cannot start with whitespace comment: warc-fields lines must end with \\r\\n: test: lines should end with \\r\\n @@ -129,6 +132,7 @@ def test_torture_validate_record(): recommendation: missing recommended header WARC-Refers-To recommendation: missing recommended header WARC-Refers-To-Date recommendation: missing recommended header WARC-Refers-To-Target-URI + comment: extension seen warc-profile http://netpreserve.org/warc/1.1/revisit/identical-payload-digest WARC-Record-ID None WARC-Type revisit digest not present @@ -138,7 +142,6 @@ def test_torture_validate_record(): error: missing required header WARC-Target-URI recommendation: missing recommended header WARC-Refers-To recommendation: missing recommended header WARC-Refers-To-Date - comment: extension seen warc-profile http://netpreserve.org/warc/1.0/revisit/server-not-modified WARC-Record-ID None WARC-Type conversion digest not present @@ -227,7 +230,6 @@ def test_torture_validate_field(): comment: Invalid-looking digest value warc-block-digest sha1:&$*^&*^#*&^ comment: extension seen warc-truncated invalid comment: extension seen warc-profile asdf - comment: extension seen warc-profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest comment: field was introduced after this warc version WARC-Refers-To-Target-URI http://example.com 1.0 comment: field was introduced after this warc version WARC-Refers-To-Date not-a-date 1.0 comment: unknown field, no validation 
performed WARC-Unknown-Field asdf @@ -274,3 +276,52 @@ def test_arc(): value = helper(args, 0) assert remove_before_test_data(value) == expected + + +def test_digests(): + # needed for test coverage + files = ['example-digest-bad.warc'] + files = [get_test_file(filename) for filename in files] + + args = ['test'] + args.extend(files) + + expected = """\ +test/data/example-digest-bad.warc + WARC-Record-ID + WARC-Type request + payload digest failed: sha1:1112H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ + error: WARC-IP-Address should be used for http and https requests + WARC-Record-ID + WARC-Type request + digest pass + error: WARC-IP-Address should be used for http and https requests + WARC-Record-ID + WARC-Type request + digest pass + error: WARC-IP-Address should be used for http and https requests + WARC-Record-ID + WARC-Type request + digest pass + error: WARC-IP-Address should be used for http and https requests +""" + + value = helper(args, 0) + assert remove_before_test_data(value) == expected + + +def test_leftovers(): + commentary = warcio.tester.Commentary('id', 'type') + + # hard to test because invalid WARC Content-Length raises in archiveiterator + warcio.tester.validate_content_length('content-length', 'not-an-integer', None, '1.0', commentary, None) + + # hard to test because warcio checks the WARC version + warcio.tester.validate_profile('blah', 'blah', None, '999', commentary, None) + + expected = '''\ +error: must be an integer content-length not-an-integer +comment: no profile check because unknown warc version blah blah +''' + + assert '\n'.join(commentary.comments())+'\n' == expected diff --git a/warcio/tester.py b/warcio/tester.py index 308f35fd..de9f3ca1 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -2,10 +2,10 @@ import re import sys -import traceback from warcio.archiveiterator import WARCIterator from warcio.utils import to_native_str, Digester +from warcio.exceptions import ArchiveLoadFailed class Commentary: @@ -196,8 +196,11 @@ def 
validate_revisit(record, commentary, pending): 'recommended': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date'], } validate_fields_against_rec_type('revisit', config, record.rec_headers, commentary, allow_all=True) - # may have record block; if not, shall have Content-Length: 0, if yes, should be like a response record, truncated FOR LENGTH ONLY if desired - # recommended that server response headers be preserved "in this manner" + # may have record block; + # if not, shall have Content-Length: 0, + # if yes, should be like a response record, truncated FOR LENGTH ONLY if desired + # recommended that server response headers be preserved "in this manner" + # I suppose that means headers are required if there is any content?! elif warc_profile.endswith('/revisit/server-not-modified'): config = { @@ -205,7 +208,9 @@ def validate_revisit(record, commentary, pending): 'prohibited': ['WARC-Payload-Digest'], } validate_fields_against_rec_type('revisit', config, record.rec_headers, commentary, allow_all=True) - # may have content body; if not, shall have Content-Length: 0, if yes, should be like a response record, truncated if desired + # may have content body; + # if not, shall have Content-Length: 0, + # if yes, should be like a response record, truncated if desired # WARC-Refers-To-Date should be the same as WARC-Date in the original record if present else: commentary.comment('no revisit details validation done due to unknown profile') @@ -343,13 +348,12 @@ def validate_digest(field, value, record, version, commentary, pending): def validate_ip(field, value, record, version, commentary, pending): - # ipv4 as dotted quad, or ipv6 per section 2.2 of rfc 4291 try: import ipaddress ipaddress.ip_address(value) except ValueError: commentary.error('invalid ip', field, value) - except (ImportError, NameError): + except (ImportError, NameError): # pragma: no cover (for python 2.7) commentary.comment('did not check ip address format, install ipaddress module 
from pypi if you care') @@ -369,12 +373,14 @@ def validate_filename(field, value, record, version, commentary, pending): profiles = { - '1.0': ['http://netpreserve.org/warc/1.1/revisit/identical-payload-digest', - 'http://netpreserve.org/warc/1.1/revisit/server-not-modified', + # XXX WARC/0.17 and WARC/0.18 + '1.0': ['http://netpreserve.org/warc/1.0/revisit/identical-payload-digest', + 'http://netpreserve.org/warc/1.0/revisit/server-not-modified', # the following removed from iipc/webarchive-commons in may 2017; common in the wild TODO comment or not? + # https://github.com/iipc/webarchive-commons/commits/988bec707c27a01333becfc3bd502af4441ea1e1/src/main/java/org/archive/format/warc/WARCConstants.java 'http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest'], - '1.1': ['http://netpreserve.org/warc/1.0/revisit/identical-payload-digest', - 'http://netpreserve.org/warc/1.0/revisit/server-not-modified'], + '1.1': ['http://netpreserve.org/warc/1.1/revisit/identical-payload-digest', + 'http://netpreserve.org/warc/1.1/revisit/server-not-modified'], } @@ -614,21 +620,15 @@ def _process_one(warc): with open(warc, 'rb') as stream: for record in WARCIterator(stream, check_digests=True, fixup_bugs=False): - try: - record = WrapRecord(record) - digest_present = (record.rec_headers.get_header('WARC-Payload-Digest') or - record.rec_headers.get_header('WARC-Block-Digest')) + record = WrapRecord(record) + digest_present = (record.rec_headers.get_header('WARC-Payload-Digest') or + record.rec_headers.get_header('WARC-Block-Digest')) - commentary = validate_record(record) + commentary = validate_record(record) - record.content # make sure digests are checked - # XXX might need to read and digest the raw stream to check digests for chunked encoding? 
- # XXX chunked lacks Content-Length and presumably the digest needs to be computed on the non-chunked bytes - except Exception: # pragma: no cover - # because of the top-level try: to catch exceptions in WARCIterator, this is needed to debug our code - print('Caught exception in warcio test analysis code') - traceback.print_exc() - exit(1) + record.content # make sure digests are checked + # XXX might need to read and digest the raw stream to check digests for chunked encoding? + # XXX chunked lacks Content-Length and presumably the digest needs to be computed on the non-chunked bytes if commentary.has_comments() or record.digest_checker.passed is False: print(' ', 'WARC-Record-ID', commentary.record_id()) @@ -637,7 +637,7 @@ def _process_one(warc): if record.digest_checker.passed is True: print(' digest pass') elif record.digest_checker.passed is None: - if digest_present: + if digest_present: # pragma: no cover print(' digest present but not checked') else: print(' digest not present') @@ -659,8 +659,8 @@ def process_all(self): print(warc) try: self.process_one(warc) - except Exception as e: - print(' saw exception '+repr(e).rstrip(), file=sys.stderr) + except ArchiveLoadFailed as e: + print(' saw exception ArchiveLoadFailed: '+str(e).rstrip(), file=sys.stderr) print(' skipping rest of file', file=sys.stderr) return self.exit_value From 5bfffea4c2200284cb876c8700b5ac578a4ec544 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Sun, 27 Jan 2019 23:57:53 -0800 Subject: [PATCH 12/68] branch coverage --- .../data/standard-torture-validate-field.warc | 1 + .../standard-torture-validate-record.warc | 26 +++++++++++ test/test_tests.py | 44 ++++++++++++++++++- warcio/tester.py | 1 + 4 files changed, 71 insertions(+), 1 deletion(-) diff --git a/test/data/standard-torture-validate-field.warc b/test/data/standard-torture-validate-field.warc index 2c28d72d..c88d3ee6 100644 --- a/test/data/standard-torture-validate-field.warc +++ 
b/test/data/standard-torture-validate-field.warc @@ -39,6 +39,7 @@ Content-Length: 0 WARC/1.1 WARC-Date: 2017-03-06T04:03:53Z WARC-Date: 2017-03-06T04:03:53.Z +WARC-Date: 2017-03-06T04:03:53.0Z WARC-Type: invalid Content-Length: 0 diff --git a/test/data/standard-torture-validate-record.warc b/test/data/standard-torture-validate-record.warc index 08a39e50..6f06205e 100644 --- a/test/data/standard-torture-validate-record.warc +++ b/test/data/standard-torture-validate-record.warc @@ -41,9 +41,23 @@ Content-Type: text/dns Content-Length: 0 +WARC/1.0 +WARC-Type: resource +WARC-Target-URI: foo:bar +Content-Length: 0 + + +WARC/1.0 +WARC-Type: request +WARC-Target-URI: hTtP://example.com/ +Content-Type: text/plain +Content-Length: 0 + + WARC/1.0 WARC-Type: request WARC-Target-URI: hTtP://example.com/ +WARC-IP-Address: 1.2.3.4 Content-Type: text/plain Content-Length: 0 @@ -54,6 +68,12 @@ Content-Type: application/warc-fields Content-Length: 0 +WARC/1.0 +WARC-Type: metadata +Content-Type: not-application/warc-fields +Content-Length: 0 + + WARC/1.0 WARC-Type: revisit WARC-Profile: none @@ -83,3 +103,9 @@ WARC-Segment-Number: 1 Content-Length: 0 +WARC/1.0 +WARC-Type: continuation +WARC-Segment-Number: 2 +Content-Length: 0 + + diff --git a/test/test_tests.py b/test/test_tests.py index 0fdecc74..174466c8 100644 --- a/test/test_tests.py +++ b/test/test_tests.py @@ -99,6 +99,12 @@ def test_torture_validate_record(): digest not present error: missing required header WARC-Date error: missing required header WARC-Record-ID + WARC-Record-ID None + WARC-Type resource + digest not present + error: missing required header Content-Type + error: missing required header WARC-Date + error: missing required header WARC-Record-ID WARC-Record-ID None WARC-Type request digest not present @@ -106,12 +112,23 @@ def test_torture_validate_record(): error: missing required header WARC-Record-ID error: requests for http/https should have Content-Type of application/http; msgtype=request or 
application/http, saw text/plain error: WARC-IP-Address should be used for http and https requests + WARC-Record-ID None + WARC-Type request + digest not present + error: missing required header WARC-Date + error: missing required header WARC-Record-ID + error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw text/plain WARC-Record-ID None WARC-Type metadata digest not present error: missing required header WARC-Date error: missing required header WARC-Record-ID comment: warc-fields body present but empty + WARC-Record-ID None + WARC-Type metadata + digest not present + error: missing required header WARC-Date + error: missing required header WARC-Record-ID WARC-Record-ID None WARC-Type revisit digest not present @@ -157,6 +174,14 @@ def test_torture_validate_record(): error: missing required header WARC-Target-URI error: continuation record must have WARC-Segment-Number > 1, saw 1 comment: warcio test continuation code has not been tested, expect bugs + WARC-Record-ID None + WARC-Type continuation + digest not present + error: missing required header WARC-Date + error: missing required header WARC-Record-ID + error: missing required header WARC-Segment-Origin-ID + error: missing required header WARC-Target-URI + comment: warcio test continuation code has not been tested, expect bugs """ value = helper(args, 0) @@ -238,6 +263,7 @@ def test_torture_validate_field(): digest not present error: duplicate field seen warc-date 2017-03-06T04:03:53.Z error: fractional seconds must have 1-9 digits warc-date 2017-03-06T04:03:53.Z + error: duplicate field seen warc-date 2017-03-06T04:03:53.0Z comment: unknown WARC-Type warc-type invalid WARC-Record-ID None WARC-Type request @@ -280,7 +306,7 @@ def test_arc(): def test_digests(): # needed for test coverage - files = ['example-digest-bad.warc'] + files = ['example-digest-bad.warc', 'example.warc'] files = [get_test_file(filename) for filename in files] args = 
['test'] @@ -304,6 +330,21 @@ def test_digests(): WARC-Type request digest pass error: WARC-IP-Address should be used for http and https requests +test/data/example.warc + WARC-Record-ID + WARC-Type request + digest not present + error: WARC-IP-Address should be used for http and https requests + WARC-Record-ID + WARC-Type revisit + digest present but not checked + recommendation: missing recommended header WARC-Refers-To + comment: field was introduced after this warc version WARC-Refers-To-Target-URI http://example.com/ 1.0 + comment: field was introduced after this warc version WARC-Refers-To-Date 2017-03-06T04:02:06Z 1.0 + WARC-Record-ID + WARC-Type request + digest not present + error: WARC-IP-Address should be used for http and https requests """ value = helper(args, 0) @@ -312,6 +353,7 @@ def test_digests(): def test_leftovers(): commentary = warcio.tester.Commentary('id', 'type') + assert not commentary.has_comments() # hard to test because invalid WARC Content-Length raises in archiveiterator warcio.tester.validate_content_length('content-length', 'not-an-integer', None, '1.0', commentary, None) diff --git a/warcio/tester.py b/warcio/tester.py index de9f3ca1..eaf7f09f 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -638,6 +638,7 @@ def _process_one(warc): print(' digest pass') elif record.digest_checker.passed is None: if digest_present: # pragma: no cover + # WARC record missing Content-Length: header, which is verboten print(' digest present but not checked') else: print(' digest not present') From bb31f14707b2019e6b406ffc2f7dc89af418d17e Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Mon, 28 Jan 2019 00:05:31 -0800 Subject: [PATCH 13/68] py2 branch coverage --- test/test_tests.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_tests.py b/test/test_tests.py index 174466c8..98517308 100644 --- a/test/test_tests.py +++ b/test/test_tests.py @@ -191,6 +191,7 @@ def test_torture_validate_record(): if six.PY2: expected = 
expected.replace('\n error: warc-fields contains invalid utf-8: \'utf-8\' codec can\'t decode byte 0xc3 in position 57: invalid continuation byte\n', '\n') + ret = ret.replace('\n comment: did not check ip address format, install ipaddress module from pypi if you care\n', '\n') assert ret == expected From cc542596826f3112965107229fd63eaabb077308 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Mon, 28 Jan 2019 10:09:17 -0800 Subject: [PATCH 14/68] py2 testing --- setup.py | 17 +++++++++++------ test/test_tests.py | 20 +++++--------------- warcio/tester.py | 26 ++++++++++++++++++++------ 3 files changed, 36 insertions(+), 27 deletions(-) diff --git a/setup.py b/setup.py index 3a1dce58..57f402dc 100755 --- a/setup.py +++ b/setup.py @@ -4,6 +4,7 @@ from setuptools import setup, find_packages from setuptools.command.test import test as TestCommand import glob +import sys __version__ = '1.7.0.dev0' @@ -21,6 +22,15 @@ def run_tests(self): errcode = pytest.main(['--doctest-module', './warcio', '--cov', 'warcio', '-v', 'test/']) sys.exit(errcode) +tests_require = [ + 'pytest', + 'pytest-cov', + 'httpbin==0.5.0', + 'requests', +] +if sys.version_info < (3, 3): + tests_require.append('ipaddress') + setup( name='warcio', version=__version__, @@ -44,12 +54,7 @@ def run_tests(self): """, cmdclass={'test': PyTest}, test_suite='', - tests_require=[ - 'pytest', - 'pytest-cov', - 'httpbin==0.5.0', - 'requests', - ], + tests_require=tests_require, classifiers=[ 'Development Status :: 4 - Beta', 'Environment :: Web Environment', diff --git a/test/test_tests.py b/test/test_tests.py index 98517308..dab1e669 100644 --- a/test/test_tests.py +++ b/test/test_tests.py @@ -187,13 +187,9 @@ def test_torture_validate_record(): value = helper(args, 0) print(remove_before_test_data(value)) - ret = remove_before_test_data(value) + actual = remove_before_test_data(value) - if six.PY2: - expected = expected.replace('\n error: warc-fields contains invalid utf-8: \'utf-8\' codec can\'t decode byte 
0xc3 in position 57: invalid continuation byte\n', '\n') - ret = ret.replace('\n comment: did not check ip address format, install ipaddress module from pypi if you care\n', '\n') - - assert ret == expected + assert actual == expected def test_torture_validate_field(): @@ -278,16 +274,10 @@ def test_torture_validate_field(): """ value = helper(args, 0) - ret = remove_before_test_data(value) - - if six.PY2: - if 'error: invalid ip warc-ip-address 1.2.3.4.5' not in ret: - # user did not install ipaddress module - expected = expected.replace('\n error: invalid ip warc-ip-address 1.2.3.4.5\n', '\n') - ret = ret.replace('\n comment: did not check ip address format, install ipaddress module from pypi if you care\n', '\n') + actual = remove_before_test_data(value) - print(ret) - assert ret == expected + print(actual) + assert actual == expected def test_arc(): diff --git a/warcio/tester.py b/warcio/tester.py index eaf7f09f..f00479ff 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -2,6 +2,7 @@ import re import sys +import six from warcio.archiveiterator import WARCIterator from warcio.utils import to_native_str, Digester @@ -68,11 +69,22 @@ def validate_warc_fields(record, commentary): # field-name = token # token_re content = record.content - try: - text = to_native_str(content, 'utf-8', errors='strict') - except UnicodeDecodeError as e: - commentary.error('warc-fields contains invalid utf-8: '+str(e)) - text = to_native_str(content, 'utf-8', errors='replace') + + if six.PY2: # pragma: no cover + try: + content.decode('utf-8', errors='strict') + text = content # already a str + except UnicodeDecodeError as e: + err = str(e) + err = err.replace('utf8', 'utf-8') # sigh + commentary.error('warc-fields contains invalid utf-8: '+err) + text = content.decode('utf-8', errors='replace') + else: # pragma: no cover + try: + text = to_native_str(content, 'utf-8', errors='strict') + except UnicodeDecodeError as e: + commentary.error('warc-fields contains invalid utf-8: 
'+str(e)) + text = to_native_str(content, 'utf-8', errors='replace') first_line = True lines = [] @@ -350,10 +362,12 @@ def validate_digest(field, value, record, version, commentary, pending): def validate_ip(field, value, record, version, commentary, pending): try: import ipaddress + if six.PY2: # pragma: no cover + value = unicode(value) ipaddress.ip_address(value) except ValueError: commentary.error('invalid ip', field, value) - except (ImportError, NameError): # pragma: no cover (for python 2.7) + except (ImportError, NameError): # pragma: no cover commentary.comment('did not check ip address format, install ipaddress module from pypi if you care') From 2b8d596701e2805291be053cdd8a051200ae5fe6 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Mon, 28 Jan 2019 11:15:57 -0800 Subject: [PATCH 15/68] add record ids to test --- test/data/standard-torture-missing.warc | 5 - .../standard-torture-validate-record.warc | 25 +++++ test/test_tests.py | 91 ++++++++----------- warcio/tester.py | 2 +- 4 files changed, 63 insertions(+), 60 deletions(-) delete mode 100644 test/data/standard-torture-missing.warc diff --git a/test/data/standard-torture-missing.warc b/test/data/standard-torture-missing.warc deleted file mode 100644 index a1ab0714..00000000 --- a/test/data/standard-torture-missing.warc +++ /dev/null @@ -1,5 +0,0 @@ -WARC/1.0 -WARC-Type: warcinfo -Content-Length: 0 - - diff --git a/test/data/standard-torture-validate-record.warc b/test/data/standard-torture-validate-record.warc index 6f06205e..fa03b38e 100644 --- a/test/data/standard-torture-validate-record.warc +++ b/test/data/standard-torture-validate-record.warc @@ -15,13 +15,24 @@ token cannot have a space: WARC/1.0 +WARC-Record-ID: test-empty-warc-fields WARC-Type: warcinfo Content-Type: application/warc-fields Content-Length: 0 +WARC/1.0 +WARC-Type: warcinfo +WARC-Record-ID: test-warcinfo-non-recommended-content-type +Content-Type: not-application/warc-fields +Content-Length: 5 + +foo + + WARC/1.0 WARC-Type: 
response +WARC-Record-ID: test-response-content-type WARC-Target-URI: HtTp://example.com/ Content-Type: text/plain Content-Length: 0 @@ -29,6 +40,7 @@ Content-Length: 0 WARC/1.0 WARC-Type: resource +WARC-Record-ID: test-resource-dns-content-type WARC-Target-URI: DnS:asdfasdf Content-Type: text/plain Content-Length: 0 @@ -36,6 +48,8 @@ Content-Length: 0 WARC/1.0 WARC-Type: resource +WARC-Record-ID: test-resource-dns-empty +WARC-Test-TODO: add another with valid block WARC-Target-URI: DnS:asdfasdf Content-Type: text/dns Content-Length: 0 @@ -43,12 +57,14 @@ Content-Length: 0 WARC/1.0 WARC-Type: resource +WARC-Record-ID: test-resource-not-dns WARC-Target-URI: foo:bar Content-Length: 0 WARC/1.0 WARC-Type: request +WARC-Record-ID: test-request-unrecommended-content-type WARC-Target-URI: hTtP://example.com/ Content-Type: text/plain Content-Length: 0 @@ -56,6 +72,7 @@ Content-Length: 0 WARC/1.0 WARC-Type: request +WARC-Record-ID: test-request-unrecommended-content-type-with-ip WARC-Target-URI: hTtP://example.com/ WARC-IP-Address: 1.2.3.4 Content-Type: text/plain @@ -64,47 +81,55 @@ Content-Length: 0 WARC/1.0 WARC-Type: metadata +WARC-Record-ID: test-metadata-warc-fields-empty Content-Type: application/warc-fields Content-Length: 0 WARC/1.0 WARC-Type: metadata +WARC-Record-ID: test-metadata-not-warc-fields Content-Type: not-application/warc-fields Content-Length: 0 WARC/1.0 WARC-Type: revisit +WARC-Record-ID: test-revisit-profile-unknown WARC-Profile: none Content-Length: 0 WARC/1.0 WARC-Type: revisit +WARC-Record-ID: test-revisit-profile-future WARC-Profile: http://netpreserve.org/warc/1.1/revisit/identical-payload-digest Content-Length: 0 WARC/1.0 WARC-Type: revisit +WARC-Record-ID: test-revisit-profile-good WARC-Profile: http://netpreserve.org/warc/1.0/revisit/server-not-modified Content-Length: 0 WARC/1.0 WARC-Type: conversion +WARC-Record-ID: test-conversion Content-Length: 0 WARC/1.0 WARC-Type: continuation +WARC-Record-ID: test-continuation-segment-1 
WARC-Segment-Number: 1 Content-Length: 0 WARC/1.0 WARC-Type: continuation +WARC-Record-ID: test-continuation-segment-valid WARC-Segment-Number: 2 Content-Length: 0 diff --git a/test/test_tests.py b/test/test_tests.py index dab1e669..723b2bd9 100644 --- a/test/test_tests.py +++ b/test/test_tests.py @@ -1,5 +1,3 @@ -import six - from warcio.cli import main from warcio.utils import to_native_str import warcio.tester @@ -32,28 +30,6 @@ def remove_before_test_data(s): return ret -def test_torture_missing(): - files = ['standard-torture-missing.warc'] - files = [get_test_file(filename) for filename in files] - - args = ['test'] - args.extend(files) - - expected = """\ -test/data/standard-torture-missing.warc - WARC-Record-ID None - WARC-Type warcinfo - digest not present - error: missing required header Content-Type - error: missing required header WARC-Date - error: missing required header WARC-Record-ID - recommendation: warcinfo Content-Type of application/warc-fields, saw none -""" - - value = helper(args, 0) - assert remove_before_test_data(value) == expected - - def test_torture_validate_record(): files = ['standard-torture-validate-record.warc'] files = [get_test_file(filename) for filename in files] @@ -75,110 +51,117 @@ def test_torture_validate_record(): comment: warc-fields lines must end with \\r\\n: test: lines should end with \\r\\n comment: Missing field-name : in warc-fields line: no colon comment: invalid warc-fields name: token cannot have a space - WARC-Record-ID None + WARC-Record-ID test-empty-warc-fields WARC-Type warcinfo digest not present + error: uri must be within <> warc-record-id test-empty-warc-fields error: missing required header WARC-Date - error: missing required header WARC-Record-ID comment: warc-fields body present but empty - WARC-Record-ID None + WARC-Record-ID test-warcinfo-non-recommended-content-type + WARC-Type warcinfo + digest not present + error: uri must be within <> warc-record-id test-warcinfo-non-recommended-content-type 
+ error: missing required header WARC-Date + recommendation: warcinfo Content-Type recommended to be application/warc-fields, saw not-application/warc-fields + WARC-Record-ID test-response-content-type WARC-Type response digest not present + error: uri must be within <> warc-record-id test-response-content-type error: missing required header WARC-Date - error: missing required header WARC-Record-ID error: responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw text/plain error: WARC-IP-Address should be used for http and https responses - WARC-Record-ID None + WARC-Record-ID test-resource-dns-content-type WARC-Type resource digest not present + error: uri must be within <> warc-record-id test-resource-dns-content-type error: missing required header WARC-Date - error: missing required header WARC-Record-ID error: recource records for dns: shall have Content-Type of text/dns, saw text/plain - WARC-Record-ID None + WARC-Record-ID test-resource-dns-empty WARC-Type resource digest not present + error: uri must be within <> warc-record-id test-resource-dns-empty error: missing required header WARC-Date - error: missing required header WARC-Record-ID - WARC-Record-ID None + comment: unknown field, no validation performed WARC-Test-TODO add another with valid block + WARC-Record-ID test-resource-not-dns WARC-Type resource digest not present + error: uri must be within <> warc-record-id test-resource-not-dns error: missing required header Content-Type error: missing required header WARC-Date - error: missing required header WARC-Record-ID - WARC-Record-ID None + WARC-Record-ID test-request-unrecommended-content-type WARC-Type request digest not present + error: uri must be within <> warc-record-id test-request-unrecommended-content-type error: missing required header WARC-Date - error: missing required header WARC-Record-ID error: requests for http/https should have Content-Type of application/http; msgtype=request or 
application/http, saw text/plain error: WARC-IP-Address should be used for http and https requests - WARC-Record-ID None + WARC-Record-ID test-request-unrecommended-content-type-with-ip WARC-Type request digest not present + error: uri must be within <> warc-record-id test-request-unrecommended-content-type-with-ip error: missing required header WARC-Date - error: missing required header WARC-Record-ID error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw text/plain - WARC-Record-ID None + WARC-Record-ID test-metadata-warc-fields-empty WARC-Type metadata digest not present + error: uri must be within <> warc-record-id test-metadata-warc-fields-empty error: missing required header WARC-Date - error: missing required header WARC-Record-ID comment: warc-fields body present but empty - WARC-Record-ID None + WARC-Record-ID test-metadata-not-warc-fields WARC-Type metadata digest not present + error: uri must be within <> warc-record-id test-metadata-not-warc-fields error: missing required header WARC-Date - error: missing required header WARC-Record-ID - WARC-Record-ID None + WARC-Record-ID test-revisit-profile-unknown WARC-Type revisit digest not present + error: uri must be within <> warc-record-id test-revisit-profile-unknown error: missing required header Content-Type error: missing required header WARC-Date - error: missing required header WARC-Record-ID error: missing required header WARC-Target-URI comment: extension seen warc-profile none comment: no revisit details validation done due to unknown profile - WARC-Record-ID None + WARC-Record-ID test-revisit-profile-future WARC-Type revisit digest not present + error: uri must be within <> warc-record-id test-revisit-profile-future error: missing required header Content-Type error: missing required header WARC-Date - error: missing required header WARC-Record-ID error: missing required header WARC-Target-URI error: missing required header 
WARC-Payload-Digest recommendation: missing recommended header WARC-Refers-To recommendation: missing recommended header WARC-Refers-To-Date recommendation: missing recommended header WARC-Refers-To-Target-URI comment: extension seen warc-profile http://netpreserve.org/warc/1.1/revisit/identical-payload-digest - WARC-Record-ID None + WARC-Record-ID test-revisit-profile-good WARC-Type revisit digest not present + error: uri must be within <> warc-record-id test-revisit-profile-good error: missing required header Content-Type error: missing required header WARC-Date - error: missing required header WARC-Record-ID error: missing required header WARC-Target-URI recommendation: missing recommended header WARC-Refers-To recommendation: missing recommended header WARC-Refers-To-Date - WARC-Record-ID None + WARC-Record-ID test-conversion WARC-Type conversion digest not present + error: uri must be within <> warc-record-id test-conversion error: missing required header WARC-Date - error: missing required header WARC-Record-ID error: missing required header WARC-Target-URI - WARC-Record-ID None + WARC-Record-ID test-continuation-segment-1 WARC-Type continuation digest not present + error: uri must be within <> warc-record-id test-continuation-segment-1 error: missing required header WARC-Date - error: missing required header WARC-Record-ID error: missing required header WARC-Segment-Origin-ID error: missing required header WARC-Target-URI error: continuation record must have WARC-Segment-Number > 1, saw 1 comment: warcio test continuation code has not been tested, expect bugs - WARC-Record-ID None + WARC-Record-ID test-continuation-segment-valid WARC-Type continuation digest not present + error: uri must be within <> warc-record-id test-continuation-segment-valid error: missing required header WARC-Date - error: missing required header WARC-Record-ID error: missing required header WARC-Segment-Origin-ID error: missing required header WARC-Target-URI comment: warcio test 
continuation code has not been tested, expect bugs diff --git a/warcio/tester.py b/warcio/tester.py index f00479ff..e9755c8c 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -125,7 +125,7 @@ def validate_warc_fields(record, commentary): def validate_warcinfo(record, commentary, pending): content_type = record.rec_headers.get_header('Content-Type', 'none') if content_type.lower() != 'application/warc-fields': - commentary.recommendation('warcinfo Content-Type of application/warc-fields, saw', content_type) + commentary.recommendation('warcinfo Content-Type recommended to be application/warc-fields, saw', content_type) else: # format: warc-fields # allowable fields include but not limited to DMCI plus the following From c704fe9886245204158edc89e32da40799e5eaa1 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Mon, 28 Jan 2019 12:19:35 -0800 Subject: [PATCH 16/68] preserve capitalization in messages --- test/test_tests.py | 142 ++++++++++++++++++++++----------------------- warcio/tester.py | 15 +++-- 2 files changed, 78 insertions(+), 79 deletions(-) diff --git a/test/test_tests.py b/test/test_tests.py index 723b2bd9..c922eff1 100644 --- a/test/test_tests.py +++ b/test/test_tests.py @@ -42,7 +42,7 @@ def test_torture_validate_record(): WARC-Record-ID None WARC-Type warcinfo digest not present - error: uri must be within <> warc-refers-to probhibited + error: uri must be within <> WARC-Refers-To probhibited error: missing required header WARC-Date error: missing required header WARC-Record-ID error: field not allowed in record_type WARC-Refers-To warcinfo @@ -54,77 +54,77 @@ def test_torture_validate_record(): WARC-Record-ID test-empty-warc-fields WARC-Type warcinfo digest not present - error: uri must be within <> warc-record-id test-empty-warc-fields + error: uri must be within <> WARC-Record-ID test-empty-warc-fields error: missing required header WARC-Date comment: warc-fields body present but empty WARC-Record-ID test-warcinfo-non-recommended-content-type 
WARC-Type warcinfo digest not present - error: uri must be within <> warc-record-id test-warcinfo-non-recommended-content-type + error: uri must be within <> WARC-Record-ID test-warcinfo-non-recommended-content-type error: missing required header WARC-Date recommendation: warcinfo Content-Type recommended to be application/warc-fields, saw not-application/warc-fields WARC-Record-ID test-response-content-type WARC-Type response digest not present - error: uri must be within <> warc-record-id test-response-content-type + error: uri must be within <> WARC-Record-ID test-response-content-type error: missing required header WARC-Date error: responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw text/plain error: WARC-IP-Address should be used for http and https responses WARC-Record-ID test-resource-dns-content-type WARC-Type resource digest not present - error: uri must be within <> warc-record-id test-resource-dns-content-type + error: uri must be within <> WARC-Record-ID test-resource-dns-content-type error: missing required header WARC-Date error: recource records for dns: shall have Content-Type of text/dns, saw text/plain WARC-Record-ID test-resource-dns-empty WARC-Type resource digest not present - error: uri must be within <> warc-record-id test-resource-dns-empty + error: uri must be within <> WARC-Record-ID test-resource-dns-empty error: missing required header WARC-Date comment: unknown field, no validation performed WARC-Test-TODO add another with valid block WARC-Record-ID test-resource-not-dns WARC-Type resource digest not present - error: uri must be within <> warc-record-id test-resource-not-dns + error: uri must be within <> WARC-Record-ID test-resource-not-dns error: missing required header Content-Type error: missing required header WARC-Date WARC-Record-ID test-request-unrecommended-content-type WARC-Type request digest not present - error: uri must be within <> warc-record-id 
test-request-unrecommended-content-type + error: uri must be within <> WARC-Record-ID test-request-unrecommended-content-type error: missing required header WARC-Date error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw text/plain error: WARC-IP-Address should be used for http and https requests WARC-Record-ID test-request-unrecommended-content-type-with-ip WARC-Type request digest not present - error: uri must be within <> warc-record-id test-request-unrecommended-content-type-with-ip + error: uri must be within <> WARC-Record-ID test-request-unrecommended-content-type-with-ip error: missing required header WARC-Date error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw text/plain WARC-Record-ID test-metadata-warc-fields-empty WARC-Type metadata digest not present - error: uri must be within <> warc-record-id test-metadata-warc-fields-empty + error: uri must be within <> WARC-Record-ID test-metadata-warc-fields-empty error: missing required header WARC-Date comment: warc-fields body present but empty WARC-Record-ID test-metadata-not-warc-fields WARC-Type metadata digest not present - error: uri must be within <> warc-record-id test-metadata-not-warc-fields + error: uri must be within <> WARC-Record-ID test-metadata-not-warc-fields error: missing required header WARC-Date WARC-Record-ID test-revisit-profile-unknown WARC-Type revisit digest not present - error: uri must be within <> warc-record-id test-revisit-profile-unknown + error: uri must be within <> WARC-Record-ID test-revisit-profile-unknown error: missing required header Content-Type error: missing required header WARC-Date error: missing required header WARC-Target-URI - comment: extension seen warc-profile none + comment: extension seen WARC-Profile none comment: no revisit details validation done due to unknown profile WARC-Record-ID test-revisit-profile-future WARC-Type 
revisit digest not present - error: uri must be within <> warc-record-id test-revisit-profile-future + error: uri must be within <> WARC-Record-ID test-revisit-profile-future error: missing required header Content-Type error: missing required header WARC-Date error: missing required header WARC-Target-URI @@ -132,11 +132,11 @@ def test_torture_validate_record(): recommendation: missing recommended header WARC-Refers-To recommendation: missing recommended header WARC-Refers-To-Date recommendation: missing recommended header WARC-Refers-To-Target-URI - comment: extension seen warc-profile http://netpreserve.org/warc/1.1/revisit/identical-payload-digest + comment: extension seen WARC-Profile http://netpreserve.org/warc/1.1/revisit/identical-payload-digest WARC-Record-ID test-revisit-profile-good WARC-Type revisit digest not present - error: uri must be within <> warc-record-id test-revisit-profile-good + error: uri must be within <> WARC-Record-ID test-revisit-profile-good error: missing required header Content-Type error: missing required header WARC-Date error: missing required header WARC-Target-URI @@ -145,13 +145,13 @@ def test_torture_validate_record(): WARC-Record-ID test-conversion WARC-Type conversion digest not present - error: uri must be within <> warc-record-id test-conversion + error: uri must be within <> WARC-Record-ID test-conversion error: missing required header WARC-Date error: missing required header WARC-Target-URI WARC-Record-ID test-continuation-segment-1 WARC-Type continuation digest not present - error: uri must be within <> warc-record-id test-continuation-segment-1 + error: uri must be within <> WARC-Record-ID test-continuation-segment-1 error: missing required header WARC-Date error: missing required header WARC-Segment-Origin-ID error: missing required header WARC-Target-URI @@ -160,7 +160,7 @@ def test_torture_validate_record(): WARC-Record-ID test-continuation-segment-valid WARC-Type continuation digest not present - error: uri must be 
within <> warc-record-id test-continuation-segment-valid + error: uri must be within <> WARC-Record-ID test-continuation-segment-valid error: missing required header WARC-Date error: missing required header WARC-Segment-Origin-ID error: missing required header WARC-Target-URI @@ -187,64 +187,64 @@ def test_torture_validate_field(): WARC-Record-ID WARC-Type does-not-exist unknown hash algorithm name in block digest - error: uri must not be within <> warc-target-uri - error: invalid uri scheme, bad character warc-target-uri - error: duplicate field seen warc-target-uri example.com - error: invalid uri, no scheme warc-target-uri example.com - error: duplicate field seen warc-target-uri ex ample.com - error: invalid uri, no scheme warc-target-uri ex ample.com - error: invalid uri, contains whitespace warc-target-uri ex ample.com - error: invalid uri scheme, bad character warc-target-uri ex ample.com - error: duplicate field seen warc-target-uri h<>ttp://example.com/ - error: invalid uri scheme, bad character warc-target-uri h<>ttp://example.com/ - error: duplicate field seen warc-type CAPITALIZED - error: uri must be within <> warc-concurrent-to http://example.com/ - error: duplicate field seen warc-date 2017-03-06T04:03:53.Z - error: WARC 1.0 may not have fractional seconds warc-date 2017-03-06T04:03:53.Z - error: must contain a / content-type asdf - error: invalid subtype content-type asdf - error: duplicate field seen content-type has space/asdf - error: invalid type content-type has space/asdf - error: duplicate field seen content-type asdf/has space - error: invalid subtype content-type asdf/has space - error: duplicate field seen content-type asdf/has space;asdf - error: invalid subtype content-type asdf/has space;asdf - error: missing algorithm warc-block-digest asdf - error: duplicate field seen warc-block-digest has space:asdf - error: invalid algorithm warc-block-digest has space:asdf - error: duplicate field seen warc-block-digest sha1:&$*^&*^#*&^ - error: 
invalid ip warc-ip-address 1.2.3.4.5 - error: uri must be within <> warc-warcinfo-id asdf:asdf - error: duplicate field seen warc-profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest - error: must contain a / warc-identified-payload-type asdf - error: invalid subtype warc-identified-payload-type asdf - error: uri must be within <> warc-segment-origin-id http://example.com - error: must be an integer warc-segment-number not-an-integer - error: duplicate field seen warc-segment-number 0 - error: must be 1 or greater warc-segment-number 0 - error: non-continuation records must always have WARC-Segment-Number = 1 warc-segment-number 0 - error: duplicate field seen warc-segment-number 1 - error: duplicate field seen warc-segment-number 2 - error: non-continuation records must always have WARC-Segment-Number = 1 warc-segment-number 2 - error: duplicate field seen warc-segment-total-length not-an-integer - error: must be an integer warc-segment-total-length not-an-integer - comment: unknown WARC-Type warc-type does-not-exist - comment: WARC-Type is not lower-case warc-type CAPITALIZED - comment: unknown WARC-Type warc-type CAPITALIZED - comment: unknown digest algorithm warc-block-digest asdf - comment: Invalid-looking digest value warc-block-digest sha1:&$*^&*^#*&^ - comment: extension seen warc-truncated invalid - comment: extension seen warc-profile asdf + error: uri must not be within <> WARC-Target-URI + error: invalid uri scheme, bad character WARC-Target-URI + error: duplicate field seen WARC-Target-URI example.com + error: invalid uri, no scheme WARC-Target-URI example.com + error: duplicate field seen WARC-Target-URI ex ample.com + error: invalid uri, no scheme WARC-Target-URI ex ample.com + error: invalid uri, contains whitespace WARC-Target-URI ex ample.com + error: invalid uri scheme, bad character WARC-Target-URI ex ample.com + error: duplicate field seen WARC-Target-URI h<>ttp://example.com/ + error: invalid uri scheme, bad character 
WARC-Target-URI h<>ttp://example.com/ + error: duplicate field seen WARC-Type CAPITALIZED + error: uri must be within <> WARC-Concurrent-To http://example.com/ + error: duplicate field seen WARC-Date 2017-03-06T04:03:53.Z + error: WARC 1.0 may not have fractional seconds WARC-Date 2017-03-06T04:03:53.Z + error: must contain a / Content-Type asdf + error: invalid subtype Content-Type asdf + error: duplicate field seen Content-Type has space/asdf + error: invalid type Content-Type has space/asdf + error: duplicate field seen Content-Type asdf/has space + error: invalid subtype Content-Type asdf/has space + error: duplicate field seen Content-Type asdf/has space;asdf + error: invalid subtype Content-Type asdf/has space;asdf + error: missing algorithm WARC-Block-Digest asdf + error: duplicate field seen WARC-Block-Digest has space:asdf + error: invalid algorithm WARC-Block-Digest has space:asdf + error: duplicate field seen WARC-Block-Digest sha1:&$*^&*^#*&^ + error: invalid ip WARC-IP-Address 1.2.3.4.5 + error: uri must be within <> WARC-Warcinfo-ID asdf:asdf + error: duplicate field seen WARC-Profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest + error: must contain a / WARC-Identified-Payload-Type asdf + error: invalid subtype WARC-Identified-Payload-Type asdf + error: uri must be within <> WARC-Segment-Origin-ID http://example.com + error: must be an integer WARC-Segment-Number not-an-integer + error: duplicate field seen WARC-Segment-Number 0 + error: must be 1 or greater WARC-Segment-Number 0 + error: non-continuation records must always have WARC-Segment-Number = 1 WARC-Segment-Number 0 + error: duplicate field seen WARC-Segment-Number 1 + error: duplicate field seen WARC-Segment-Number 2 + error: non-continuation records must always have WARC-Segment-Number = 1 WARC-Segment-Number 2 + error: duplicate field seen WARC-Segment-Total-Length not-an-integer + error: must be an integer WARC-Segment-Total-Length not-an-integer + comment: unknown 
WARC-Type WARC-Type does-not-exist + comment: WARC-Type is not lower-case WARC-Type CAPITALIZED + comment: unknown WARC-Type WARC-Type CAPITALIZED + comment: unknown digest algorithm WARC-Block-Digest asdf + comment: Invalid-looking digest value WARC-Block-Digest sha1:&$*^&*^#*&^ + comment: extension seen WARC-Truncated invalid + comment: extension seen WARC-Profile asdf comment: field was introduced after this warc version WARC-Refers-To-Target-URI http://example.com 1.0 comment: field was introduced after this warc version WARC-Refers-To-Date not-a-date 1.0 comment: unknown field, no validation performed WARC-Unknown-Field asdf WARC-Record-ID None WARC-Type invalid digest not present - error: duplicate field seen warc-date 2017-03-06T04:03:53.Z - error: fractional seconds must have 1-9 digits warc-date 2017-03-06T04:03:53.Z - error: duplicate field seen warc-date 2017-03-06T04:03:53.0Z - comment: unknown WARC-Type warc-type invalid + error: duplicate field seen WARC-Date 2017-03-06T04:03:53.Z + error: fractional seconds must have 1-9 digits WARC-Date 2017-03-06T04:03:53.Z + error: duplicate field seen WARC-Date 2017-03-06T04:03:53.0Z + comment: unknown WARC-Type WARC-Type invalid WARC-Record-ID None WARC-Type request digest not present diff --git a/warcio/tester.py b/warcio/tester.py index e9755c8c..2300d062 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -603,19 +603,18 @@ def validate_record(record): seen_fields = set() for field, value in record.rec_headers.headers: - field_case = field - field = field.lower() - if field != 'warc-concurrent-to' and field in seen_fields: + field_l = field.lower() + if field != 'warc-concurrent-to' and field_l in seen_fields: commentary.error('duplicate field seen', field, value) - seen_fields.add(field) - if field not in warc_fields: - commentary.comment('unknown field, no validation performed', field_case, value) + seen_fields.add(field_l) + if field_l not in warc_fields: + commentary.comment('unknown field, no 
validation performed', field, value) continue - config = warc_fields[field] + config = warc_fields[field_l] if 'minver' in config: if version < config['minver']: # unknown fields are extensions, so this is a comment and not an error - commentary.comment('field was introduced after this warc version', field_case, value, version) + commentary.comment('field was introduced after this warc version', field, value, version) if 'validate' in config: config['validate'](field, value, record, version, commentary, pending) From 3839fa16bcf1ec58053379e4b695314d72e9afd6 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Mon, 28 Jan 2019 12:55:48 -0800 Subject: [PATCH 17/68] capitals and colons --- test/test_tests.py | 264 ++++++++++++++++++++++----------------------- warcio/tester.py | 88 +++++++-------- 2 files changed, 177 insertions(+), 175 deletions(-) diff --git a/test/test_tests.py b/test/test_tests.py index c922eff1..91eba656 100644 --- a/test/test_tests.py +++ b/test/test_tests.py @@ -42,128 +42,128 @@ def test_torture_validate_record(): WARC-Record-ID None WARC-Type warcinfo digest not present - error: uri must be within <> WARC-Refers-To probhibited - error: missing required header WARC-Date - error: missing required header WARC-Record-ID - error: field not allowed in record_type WARC-Refers-To warcinfo + error: uri must be within <>: WARC-Refers-To probhibited + error: missing required header: WARC-Date + error: missing required header: WARC-Record-ID + error: field not allowed in record type: warcinfo WARC-Refers-To error: warc-fields contains invalid utf-8: 'utf-8' codec can't decode byte 0xc3 in position 57: invalid continuation byte comment: The first line of warc-fields cannot start with whitespace comment: warc-fields lines must end with \\r\\n: test: lines should end with \\r\\n - comment: Missing field-name : in warc-fields line: no colon - comment: invalid warc-fields name: token cannot have a space + comment: Missing colon in warc-fields line: no colon + 
comment: Invalid warc-fields name: token cannot have a space WARC-Record-ID test-empty-warc-fields WARC-Type warcinfo digest not present - error: uri must be within <> WARC-Record-ID test-empty-warc-fields - error: missing required header WARC-Date + error: uri must be within <>: WARC-Record-ID test-empty-warc-fields + error: missing required header: WARC-Date comment: warc-fields body present but empty WARC-Record-ID test-warcinfo-non-recommended-content-type WARC-Type warcinfo digest not present - error: uri must be within <> WARC-Record-ID test-warcinfo-non-recommended-content-type - error: missing required header WARC-Date - recommendation: warcinfo Content-Type recommended to be application/warc-fields, saw not-application/warc-fields + error: uri must be within <>: WARC-Record-ID test-warcinfo-non-recommended-content-type + error: missing required header: WARC-Date + recommendation: warcinfo Content-Type recommended to be application/warc-fields, saw: not-application/warc-fields WARC-Record-ID test-response-content-type WARC-Type response digest not present - error: uri must be within <> WARC-Record-ID test-response-content-type - error: missing required header WARC-Date - error: responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw text/plain + error: uri must be within <>: WARC-Record-ID test-response-content-type + error: missing required header: WARC-Date + error: responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw: text/plain error: WARC-IP-Address should be used for http and https responses WARC-Record-ID test-resource-dns-content-type WARC-Type resource digest not present - error: uri must be within <> WARC-Record-ID test-resource-dns-content-type - error: missing required header WARC-Date - error: recource records for dns: shall have Content-Type of text/dns, saw text/plain + error: uri must be within <>: WARC-Record-ID 
test-resource-dns-content-type + error: missing required header: WARC-Date + error: recource records for dns: shall have Content-Type of text/dns, saw: text/plain WARC-Record-ID test-resource-dns-empty WARC-Type resource digest not present - error: uri must be within <> WARC-Record-ID test-resource-dns-empty - error: missing required header WARC-Date - comment: unknown field, no validation performed WARC-Test-TODO add another with valid block + error: uri must be within <>: WARC-Record-ID test-resource-dns-empty + error: missing required header: WARC-Date + comment: unknown field, no validation performed: WARC-Test-TODO add another with valid block WARC-Record-ID test-resource-not-dns WARC-Type resource digest not present - error: uri must be within <> WARC-Record-ID test-resource-not-dns - error: missing required header Content-Type - error: missing required header WARC-Date + error: uri must be within <>: WARC-Record-ID test-resource-not-dns + error: missing required header: Content-Type + error: missing required header: WARC-Date WARC-Record-ID test-request-unrecommended-content-type WARC-Type request digest not present - error: uri must be within <> WARC-Record-ID test-request-unrecommended-content-type - error: missing required header WARC-Date - error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw text/plain + error: uri must be within <>: WARC-Record-ID test-request-unrecommended-content-type + error: missing required header: WARC-Date + error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw: text/plain error: WARC-IP-Address should be used for http and https requests WARC-Record-ID test-request-unrecommended-content-type-with-ip WARC-Type request digest not present - error: uri must be within <> WARC-Record-ID test-request-unrecommended-content-type-with-ip - error: missing required header WARC-Date - error: requests for http/https 
should have Content-Type of application/http; msgtype=request or application/http, saw text/plain + error: uri must be within <>: WARC-Record-ID test-request-unrecommended-content-type-with-ip + error: missing required header: WARC-Date + error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw: text/plain WARC-Record-ID test-metadata-warc-fields-empty WARC-Type metadata digest not present - error: uri must be within <> WARC-Record-ID test-metadata-warc-fields-empty - error: missing required header WARC-Date + error: uri must be within <>: WARC-Record-ID test-metadata-warc-fields-empty + error: missing required header: WARC-Date comment: warc-fields body present but empty WARC-Record-ID test-metadata-not-warc-fields WARC-Type metadata digest not present - error: uri must be within <> WARC-Record-ID test-metadata-not-warc-fields - error: missing required header WARC-Date + error: uri must be within <>: WARC-Record-ID test-metadata-not-warc-fields + error: missing required header: WARC-Date WARC-Record-ID test-revisit-profile-unknown WARC-Type revisit digest not present - error: uri must be within <> WARC-Record-ID test-revisit-profile-unknown - error: missing required header Content-Type - error: missing required header WARC-Date - error: missing required header WARC-Target-URI - comment: extension seen WARC-Profile none - comment: no revisit details validation done due to unknown profile + error: uri must be within <>: WARC-Record-ID test-revisit-profile-unknown + error: missing required header: Content-Type + error: missing required header: WARC-Date + error: missing required header: WARC-Target-URI + comment: extension seen: WARC-Profile none + comment: no revisit details validation done due to unknown profile: none WARC-Record-ID test-revisit-profile-future WARC-Type revisit digest not present - error: uri must be within <> WARC-Record-ID test-revisit-profile-future - error: missing required header 
Content-Type - error: missing required header WARC-Date - error: missing required header WARC-Target-URI - error: missing required header WARC-Payload-Digest - recommendation: missing recommended header WARC-Refers-To - recommendation: missing recommended header WARC-Refers-To-Date - recommendation: missing recommended header WARC-Refers-To-Target-URI - comment: extension seen WARC-Profile http://netpreserve.org/warc/1.1/revisit/identical-payload-digest + error: uri must be within <>: WARC-Record-ID test-revisit-profile-future + error: missing required header: Content-Type + error: missing required header: WARC-Date + error: missing required header: WARC-Target-URI + error: missing required header: WARC-Payload-Digest + recommendation: missing recommended header: WARC-Refers-To + recommendation: missing recommended header: WARC-Refers-To-Date + recommendation: missing recommended header: WARC-Refers-To-Target-URI + comment: extension seen: WARC-Profile http://netpreserve.org/warc/1.1/revisit/identical-payload-digest WARC-Record-ID test-revisit-profile-good WARC-Type revisit digest not present - error: uri must be within <> WARC-Record-ID test-revisit-profile-good - error: missing required header Content-Type - error: missing required header WARC-Date - error: missing required header WARC-Target-URI - recommendation: missing recommended header WARC-Refers-To - recommendation: missing recommended header WARC-Refers-To-Date + error: uri must be within <>: WARC-Record-ID test-revisit-profile-good + error: missing required header: Content-Type + error: missing required header: WARC-Date + error: missing required header: WARC-Target-URI + recommendation: missing recommended header: WARC-Refers-To + recommendation: missing recommended header: WARC-Refers-To-Date WARC-Record-ID test-conversion WARC-Type conversion digest not present - error: uri must be within <> WARC-Record-ID test-conversion - error: missing required header WARC-Date - error: missing required header 
WARC-Target-URI + error: uri must be within <>: WARC-Record-ID test-conversion + error: missing required header: WARC-Date + error: missing required header: WARC-Target-URI WARC-Record-ID test-continuation-segment-1 WARC-Type continuation digest not present - error: uri must be within <> WARC-Record-ID test-continuation-segment-1 - error: missing required header WARC-Date - error: missing required header WARC-Segment-Origin-ID - error: missing required header WARC-Target-URI - error: continuation record must have WARC-Segment-Number > 1, saw 1 + error: uri must be within <>: WARC-Record-ID test-continuation-segment-1 + error: missing required header: WARC-Date + error: missing required header: WARC-Segment-Origin-ID + error: missing required header: WARC-Target-URI + error: continuation record must have WARC-Segment-Number > 1, saw: 1 comment: warcio test continuation code has not been tested, expect bugs WARC-Record-ID test-continuation-segment-valid WARC-Type continuation digest not present - error: uri must be within <> WARC-Record-ID test-continuation-segment-valid - error: missing required header WARC-Date - error: missing required header WARC-Segment-Origin-ID - error: missing required header WARC-Target-URI + error: uri must be within <>: WARC-Record-ID test-continuation-segment-valid + error: missing required header: WARC-Date + error: missing required header: WARC-Segment-Origin-ID + error: missing required header: WARC-Target-URI comment: warcio test continuation code has not been tested, expect bugs """ @@ -187,73 +187,73 @@ def test_torture_validate_field(): WARC-Record-ID WARC-Type does-not-exist unknown hash algorithm name in block digest - error: uri must not be within <> WARC-Target-URI - error: invalid uri scheme, bad character WARC-Target-URI - error: duplicate field seen WARC-Target-URI example.com - error: invalid uri, no scheme WARC-Target-URI example.com - error: duplicate field seen WARC-Target-URI ex ample.com - error: invalid uri, no scheme 
WARC-Target-URI ex ample.com - error: invalid uri, contains whitespace WARC-Target-URI ex ample.com - error: invalid uri scheme, bad character WARC-Target-URI ex ample.com - error: duplicate field seen WARC-Target-URI h<>ttp://example.com/ - error: invalid uri scheme, bad character WARC-Target-URI h<>ttp://example.com/ - error: duplicate field seen WARC-Type CAPITALIZED - error: uri must be within <> WARC-Concurrent-To http://example.com/ - error: duplicate field seen WARC-Date 2017-03-06T04:03:53.Z - error: WARC 1.0 may not have fractional seconds WARC-Date 2017-03-06T04:03:53.Z - error: must contain a / Content-Type asdf - error: invalid subtype Content-Type asdf - error: duplicate field seen Content-Type has space/asdf - error: invalid type Content-Type has space/asdf - error: duplicate field seen Content-Type asdf/has space - error: invalid subtype Content-Type asdf/has space - error: duplicate field seen Content-Type asdf/has space;asdf - error: invalid subtype Content-Type asdf/has space;asdf - error: missing algorithm WARC-Block-Digest asdf - error: duplicate field seen WARC-Block-Digest has space:asdf - error: invalid algorithm WARC-Block-Digest has space:asdf - error: duplicate field seen WARC-Block-Digest sha1:&$*^&*^#*&^ - error: invalid ip WARC-IP-Address 1.2.3.4.5 - error: uri must be within <> WARC-Warcinfo-ID asdf:asdf - error: duplicate field seen WARC-Profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest - error: must contain a / WARC-Identified-Payload-Type asdf - error: invalid subtype WARC-Identified-Payload-Type asdf - error: uri must be within <> WARC-Segment-Origin-ID http://example.com - error: must be an integer WARC-Segment-Number not-an-integer - error: duplicate field seen WARC-Segment-Number 0 - error: must be 1 or greater WARC-Segment-Number 0 - error: non-continuation records must always have WARC-Segment-Number = 1 WARC-Segment-Number 0 - error: duplicate field seen WARC-Segment-Number 1 - error: duplicate field 
seen WARC-Segment-Number 2 - error: non-continuation records must always have WARC-Segment-Number = 1 WARC-Segment-Number 2 - error: duplicate field seen WARC-Segment-Total-Length not-an-integer - error: must be an integer WARC-Segment-Total-Length not-an-integer - comment: unknown WARC-Type WARC-Type does-not-exist - comment: WARC-Type is not lower-case WARC-Type CAPITALIZED - comment: unknown WARC-Type WARC-Type CAPITALIZED - comment: unknown digest algorithm WARC-Block-Digest asdf - comment: Invalid-looking digest value WARC-Block-Digest sha1:&$*^&*^#*&^ - comment: extension seen WARC-Truncated invalid - comment: extension seen WARC-Profile asdf - comment: field was introduced after this warc version WARC-Refers-To-Target-URI http://example.com 1.0 - comment: field was introduced after this warc version WARC-Refers-To-Date not-a-date 1.0 - comment: unknown field, no validation performed WARC-Unknown-Field asdf + error: uri must not be within <>: WARC-Target-URI + error: invalid uri scheme, bad character: WARC-Target-URI + error: duplicate field seen: WARC-Target-URI example.com + error: invalid uri, no scheme: WARC-Target-URI example.com + error: duplicate field seen: WARC-Target-URI ex ample.com + error: invalid uri, no scheme: WARC-Target-URI ex ample.com + error: invalid uri, contains whitespace: WARC-Target-URI ex ample.com + error: invalid uri scheme, bad character: WARC-Target-URI ex ample.com + error: duplicate field seen: WARC-Target-URI h<>ttp://example.com/ + error: invalid uri scheme, bad character: WARC-Target-URI h<>ttp://example.com/ + error: duplicate field seen: WARC-Type CAPITALIZED + error: uri must be within <>: WARC-Concurrent-To http://example.com/ + error: duplicate field seen: WARC-Date 2017-03-06T04:03:53.Z + error: WARC 1.0 time may not have fractional seconds: WARC-Date 2017-03-06T04:03:53.Z + error: must contain a /: Content-Type asdf + error: invalid subtype: Content-Type asdf + error: duplicate field seen: Content-Type has space/asdf 
+ error: invalid type: Content-Type has space/asdf + error: duplicate field seen: Content-Type asdf/has space + error: invalid subtype: Content-Type asdf/has space + error: duplicate field seen: Content-Type asdf/has space;asdf + error: invalid subtype: Content-Type asdf/has space;asdf + error: missing algorithm: WARC-Block-Digest asdf + error: duplicate field seen: WARC-Block-Digest has space:asdf + error: invalid algorithm: WARC-Block-Digest has space:asdf + error: duplicate field seen: WARC-Block-Digest sha1:&$*^&*^#*&^ + error: invalid ip: WARC-IP-Address 1.2.3.4.5 + error: uri must be within <>: WARC-Warcinfo-ID asdf:asdf + error: duplicate field seen: WARC-Profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest + error: must contain a /: WARC-Identified-Payload-Type asdf + error: invalid subtype: WARC-Identified-Payload-Type asdf + error: uri must be within <>: WARC-Segment-Origin-ID http://example.com + error: must be an integer: WARC-Segment-Number not-an-integer + error: duplicate field seen: WARC-Segment-Number 0 + error: must be 1 or greater: WARC-Segment-Number 0 + error: non-continuation records must always have WARC-Segment-Number: 1: WARC-Segment-Number 0 + error: duplicate field seen: WARC-Segment-Number 1 + error: duplicate field seen: WARC-Segment-Number 2 + error: non-continuation records must always have WARC-Segment-Number: 1: WARC-Segment-Number 2 + error: duplicate field seen: WARC-Segment-Total-Length not-an-integer + error: must be an integer: WARC-Segment-Total-Length not-an-integer + comment: unknown WARC-Type: WARC-Type does-not-exist + comment: WARC-Type is not lower-case: WARC-Type CAPITALIZED + comment: unknown WARC-Type: WARC-Type CAPITALIZED + comment: unknown digest algorithm: WARC-Block-Digest asdf + comment: Invalid-looking digest value: WARC-Block-Digest sha1:&$*^&*^#*&^ + comment: extension seen: WARC-Truncated invalid + comment: extension seen: WARC-Profile asdf + comment: field was introduced after this warc 
version: 1.0 WARC-Refers-To-Target-URI http://example.com + comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Date not-a-date + comment: unknown field, no validation performed: WARC-Unknown-Field asdf WARC-Record-ID None WARC-Type invalid digest not present - error: duplicate field seen WARC-Date 2017-03-06T04:03:53.Z - error: fractional seconds must have 1-9 digits WARC-Date 2017-03-06T04:03:53.Z - error: duplicate field seen WARC-Date 2017-03-06T04:03:53.0Z - comment: unknown WARC-Type WARC-Type invalid + error: duplicate field seen: WARC-Date 2017-03-06T04:03:53.Z + error: fractional seconds must have 1-9 digits: WARC-Date 2017-03-06T04:03:53.Z + error: duplicate field seen: WARC-Date 2017-03-06T04:03:53.0Z + comment: unknown WARC-Type: WARC-Type invalid WARC-Record-ID None WARC-Type request digest not present - error: missing required header Content-Type - error: missing required header WARC-Date - error: missing required header WARC-Record-ID - error: missing required header WARC-Target-URI + error: missing required header: Content-Type + error: missing required header: WARC-Date + error: missing required header: WARC-Record-ID + error: missing required header: WARC-Target-URI recommendation: do not segment WARC-Type request - comment: no configuration seen for WARC-Segment-Number request + comment: Unknown field for this record type, perhaps an extension: request WARC-Segment-Number """ value = helper(args, 0) @@ -312,9 +312,9 @@ def test_digests(): WARC-Record-ID WARC-Type revisit digest present but not checked - recommendation: missing recommended header WARC-Refers-To - comment: field was introduced after this warc version WARC-Refers-To-Target-URI http://example.com/ 1.0 - comment: field was introduced after this warc version WARC-Refers-To-Date 2017-03-06T04:02:06Z 1.0 + recommendation: missing recommended header: WARC-Refers-To + comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI 
http://example.com/ + comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Date 2017-03-06T04:02:06Z WARC-Record-ID WARC-Type request digest not present @@ -330,14 +330,14 @@ def test_leftovers(): assert not commentary.has_comments() # hard to test because invalid WARC Content-Length raises in archiveiterator - warcio.tester.validate_content_length('content-length', 'not-an-integer', None, '1.0', commentary, None) + warcio.tester.validate_content_length('Content-Length', 'not-an-integer', None, '1.0', commentary, None) # hard to test because warcio checks the WARC version warcio.tester.validate_profile('blah', 'blah', None, '999', commentary, None) expected = '''\ -error: must be an integer content-length not-an-integer -comment: no profile check because unknown warc version blah blah +error: must be an integer: Content-Length not-an-integer +comment: no profile check because unknown warc version: blah blah ''' assert '\n'.join(commentary.comments())+'\n' == expected diff --git a/warcio/tester.py b/warcio/tester.py index 2300d062..4ee05f1f 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -59,7 +59,8 @@ def __getattr__(self, name): def canon_content_type(s): - return s.lower().replace('; ', ';') + # wget omits the space after the ;, let that pass + return s.lower().replace(';msgtype=', '; msgtype=') def validate_warc_fields(record, commentary): @@ -106,11 +107,11 @@ def validate_warc_fields(record, commentary): else: # check for field-name : if ':' not in line: - commentary.comment('Missing field-name : in warc-fields line:', line) + commentary.comment('Missing colon in warc-fields line:', line) else: field_name = line.split(':', 1)[0] if not re.search(token_re, field_name): - commentary.comment('invalid warc-fields name:', field_name) + commentary.comment('Invalid warc-fields name:', field_name) else: lines.append(line) first_line = False @@ -125,7 +126,7 @@ def validate_warc_fields(record, commentary): def validate_warcinfo(record, 
commentary, pending): content_type = record.rec_headers.get_header('Content-Type', 'none') if content_type.lower() != 'application/warc-fields': - commentary.recommendation('warcinfo Content-Type recommended to be application/warc-fields, saw', content_type) + commentary.recommendation('warcinfo Content-Type recommended to be application/warc-fields, saw:', content_type) else: # format: warc-fields # allowable fields include but not limited to DMCI plus the following @@ -145,8 +146,8 @@ def validate_response(record, commentary, pending): if target_uri.startswith('http:') or target_uri.startswith('https:'): content_type = record.rec_headers.get_header('Content-Type', 'none') - if canon_content_type(content_type) not in {'application/http;msgtype=response', 'application/http'}: - commentary.error('responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw', content_type) + if canon_content_type(content_type) not in {'application/http; msgtype=response', 'application/http'}: + commentary.error('responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw:', content_type) if record.rec_headers.get_header('WARC-IP-Address') is None: commentary.error('WARC-IP-Address should be used for http and https responses') @@ -163,7 +164,7 @@ def validate_resource(record, commentary, pending): if target_uri.startswith('dns:'): content_type = record.rec_headers.get_header('Content-Type', 'none') if content_type.lower() != 'text/dns': - commentary.error('recource records for dns: shall have Content-Type of text/dns, saw', content_type) + commentary.error('recource records for dns: shall have Content-Type of text/dns, saw:', content_type) else: # rfc 2540 and rfc 1035 #validate_text_dns() @@ -178,8 +179,8 @@ def validate_request(record, commentary, pending): if target_uri.startswith('http:') or target_uri.startswith('https:'): content_type = 
record.rec_headers.get_header('Content-Type') - if canon_content_type(content_type) not in {'application/http;msgtype=request', 'application/http'}: - commentary.error('requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw', content_type) + if canon_content_type(content_type) not in {'application/http; msgtype=request', 'application/http'}: + commentary.error('requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw:', content_type) if record.rec_headers.get_header('WARC-IP-Address') is None: commentary.error('WARC-IP-Address should be used for http and https requests') @@ -225,7 +226,7 @@ def validate_revisit(record, commentary, pending): # if yes, should be like a response record, truncated if desired # WARC-Refers-To-Date should be the same as WARC-Date in the original record if present else: - commentary.comment('no revisit details validation done due to unknown profile') + commentary.comment('no revisit details validation done due to unknown profile:', warc_profile) def validate_conversion(record, commentary, pending): @@ -239,7 +240,7 @@ def validate_continuation(record, commentary, pending): segment_number = record.rec_headers.get_header('WARC-Segment-Number', 'none') if segment_number.isdigit() and int(segment_number) < 2: - commentary.error('continuation record must have WARC-Segment-Number > 1, saw', segment_number) + commentary.error('continuation record must have WARC-Segment-Number > 1, saw:', segment_number) # last segment: required WARC-Segment-Total-Length, optional WARC-Truncated @@ -251,30 +252,30 @@ def validate_actual_uri(field, value, record, version, commentary, pending): # schemes are case-insensitive and normalize to lower if value.startswith('<') or value.endswith('>'): # wget 1.19 bug caused by WARC 1.0 spec error - commentary.error('uri must not be within <>', field, value) + commentary.error('uri must not be within <>:', field, 
value) if ':' not in value: - commentary.error('invalid uri, no scheme', field, value) + commentary.error('invalid uri, no scheme:', field, value) if re.search(r'\s', value): - commentary.error('invalid uri, contains whitespace', field, value) + commentary.error('invalid uri, contains whitespace:', field, value) scheme = value.split(':', 1)[0] if not re.search(r'\A[A-Za-z][A-Za-z0-9+\-\.]*\Z', scheme): - commentary.error('invalid uri scheme, bad character', field, value) + commentary.error('invalid uri scheme, bad character:', field, value) # https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml def validate_warc_type(field, value, record, version, commentary, pending): if not value.islower(): # I am unclear if this is allowed? standard is silent - commentary.comment('WARC-Type is not lower-case', field, value) + commentary.comment('WARC-Type is not lower-case:', field, value) if value.lower() not in record_types: # standard says readers should ignore unknown warc-types - commentary.comment('unknown WARC-Type', field, value) + commentary.comment('unknown WARC-Type:', field, value) def validate_uri(field, value, record, version, commentary, pending): # < uri > if not (value.startswith('<') and value.endswith('>')): - commentary.error('uri must be within <>', field, value) + commentary.error('uri must be within <>:', field, value) return validate_actual_uri(field, value[1:-1], record, version, commentary, pending) @@ -289,12 +290,12 @@ def validate_timestamp(field, value, record, version, commentary, pending): if not use_ms: if '.' in value: # XXX specification infelicity: would be nice to have 'advice to implementers' here - commentary.error('WARC 1.0 may not have fractional seconds', field, value) + commentary.error('WARC 1.0 time may not have fractional seconds:', field, value) else: if '.' 
in value: start, end = value.split('.', 1) if not re.search(r'\A[0-9]{1,9}Z\Z', end): - commentary.error('fractional seconds must have 1-9 digits', field, value) + commentary.error('fractional seconds must have 1-9 digits:', field, value) # XXX the above is pretty incomplete for dash, colon, trailing Z, etc @@ -304,7 +305,7 @@ def validate_timestamp(field, value, record, version, commentary, pending): def validate_content_length(field, value, record, version, commentary, pending): if not value.isdigit(): - commentary.error('must be an integer', field, value) + commentary.error('must be an integer:', field, value) token_re = r'\A[!"#$%&\'()*+\-\.0-9A-Z\^_`a-z|~]+\Z' @@ -313,7 +314,7 @@ def validate_content_length(field, value, record, version, commentary, pending): def validate_content_type(field, value, record, version, commentary, pending): if '/' not in value: - commentary.error('must contain a /', field, value) + commentary.error('must contain a /:', field, value) splits = value.split('/', 1) ctype = splits[0] if len(splits) > 1: @@ -321,13 +322,13 @@ def validate_content_type(field, value, record, version, commentary, pending): else: rest = '' if not re.search(token_re, ctype): - commentary.error('invalid type', field, value) + commentary.error('invalid type:', field, value) if ';' in rest: subtype, rest = rest.split(';', 1) else: subtype = rest if not re.search(token_re, subtype): - commentary.error('invalid subtype', field, value) + commentary.error('invalid subtype:', field, value) # at this point there can be multiple parameters, # some of which could have quoted string values with ; in them @@ -337,7 +338,7 @@ def validate_content_type(field, value, record, version, commentary, pending): def validate_digest(field, value, record, version, commentary, pending): if ':' not in value: - commentary.error('missing algorithm', field, value) + commentary.error('missing algorithm:', field, value) splits = value.split(':', 1) algorithm = splits[0] if len(splits) > 1: 
@@ -345,18 +346,19 @@ def validate_digest(field, value, record, version, commentary, pending): else: digest = 'none' if not re.search(token_re, algorithm): - commentary.error('invalid algorithm', field, value) + commentary.error('invalid algorithm:', field, value) else: try: Digester(algorithm) except ValueError: - commentary.comment('unknown digest algorithm', field, value) + commentary.comment('unknown digest algorithm:', field, value) if not re.search(token_re, digest): # https://github.com/iipc/warc-specifications/issues/48 # commentary.comment('spec incorrectly says this is an invalid digest', field, value) pass if not re.search(digest_re, digest): - commentary.comment('Invalid-looking digest value', field, value) + # suggested in https://github.com/iipc/warc-specifications/issues/48 + commentary.comment('Invalid-looking digest value:', field, value) def validate_ip(field, value, record, version, commentary, pending): @@ -366,14 +368,14 @@ def validate_ip(field, value, record, version, commentary, pending): value = unicode(value) ipaddress.ip_address(value) except ValueError: - commentary.error('invalid ip', field, value) + commentary.error('invalid ip:', field, value) except (ImportError, NameError): # pragma: no cover commentary.comment('did not check ip address format, install ipaddress module from pypi if you care') def validate_truncated(field, value, record, version, commentary, pending): if value.lower() not in {'length', 'time', 'disconnect', 'unspecified'}: - commentary.comment('extension seen', field, value) + commentary.comment('extension seen:', field, value) def validate_warcinfo_id(field, value, record, version, commentary, pending): @@ -400,31 +402,31 @@ def validate_filename(field, value, record, version, commentary, pending): def validate_profile(field, value, record, version, commentary, pending): if version not in profiles: - commentary.comment('no profile check because unknown warc version', field, value) + commentary.comment('no profile 
check because unknown warc version:', field, value) return if value not in profiles[version]: - commentary.comment('extension seen', field, value) + commentary.comment('extension seen:', field, value) def validate_segment_number(field, value, record, version, commentary, pending): if not value.isdigit(): - commentary.error('must be an integer', field, value) + commentary.error('must be an integer:', field, value) return iv = int(value) if iv == 0: - commentary.error('must be 1 or greater', field, value) + commentary.error('must be 1 or greater:', field, value) rec_type = record.rec_headers.get_header('WARC-Type', 'none') if rec_type != 'continuation': if iv != 1: - commentary.error('non-continuation records must always have WARC-Segment-Number = 1', field, value) + commentary.error('non-continuation records must always have WARC-Segment-Number: 1:', field, value) if rec_type in {'warcinfo', 'request', 'metadata', 'revisit'}: commentary.recommendation('do not segment WARC-Type', rec_type) def validate_segment_total_length(field, value, record, version, commentary, pending): if not value.isdigit(): - commentary.error('must be an integer', field, value) + commentary.error('must be an integer:', field, value) warc_fields = { @@ -568,21 +570,21 @@ def make_header_set(config, kinds): def validate_fields_against_rec_type(rec_type, config, rec_headers, commentary, allow_all=False): for req in sorted(config.get('required', [])): if not rec_headers.get_header(req): - commentary.error('missing required header', req) + commentary.error('missing required header:', req) for rec in sorted(config.get('recommended', [])): if not rec_headers.get_header(rec): - commentary.recommendation('missing recommended header', rec) + commentary.recommendation('missing recommended header:', rec) allowed = make_header_set(config, ('required', 'optional', 'recommended')) prohibited = make_header_set(config, ('prohibited',)) for field, value in rec_headers.headers: fl = field.lower() if fl in 
prohibited: - commentary.error('field not allowed in record_type', field, rec_type) + commentary.error('field not allowed in record type:', rec_type, field) elif allow_all or fl in allowed: pass elif fl in warc_fields: - commentary.comment('no configuration seen for', field, rec_type) + commentary.comment('Unknown field for this record type, perhaps an extension:', rec_type, field) else: # an 'unknown field' comment has already been issued in validate_record pass @@ -605,16 +607,16 @@ def validate_record(record): for field, value in record.rec_headers.headers: field_l = field.lower() if field != 'warc-concurrent-to' and field_l in seen_fields: - commentary.error('duplicate field seen', field, value) + commentary.error('duplicate field seen:', field, value) seen_fields.add(field_l) if field_l not in warc_fields: - commentary.comment('unknown field, no validation performed', field, value) + commentary.comment('unknown field, no validation performed:', field, value) continue config = warc_fields[field_l] if 'minver' in config: if version < config['minver']: # unknown fields are extensions, so this is a comment and not an error - commentary.comment('field was introduced after this warc version', field, value, version) + commentary.comment('field was introduced after this warc version:', version, field, value) if 'validate' in config: config['validate'](field, value, record, version, commentary, pending) From 8b9032d64251c773f9f6b8b82ec15d36aa1959f9 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Mon, 28 Jan 2019 13:15:30 -0800 Subject: [PATCH 18/68] use valid record ids --- .../data/standard-torture-validate-field.warc | 106 +++++++++--------- .../standard-torture-validate-record.warc | 32 +++--- test/test_tests.py | 62 ++++------ warcio/tester.py | 30 ++--- 4 files changed, 107 insertions(+), 123 deletions(-) diff --git a/test/data/standard-torture-validate-field.warc b/test/data/standard-torture-validate-field.warc index c88d3ee6..816413be 100644 --- 
a/test/data/standard-torture-validate-field.warc +++ b/test/data/standard-torture-validate-field.warc @@ -1,53 +1,53 @@ -WARC/1.0 -WARC-Target-URI: -WARC-Target-URI: example.com -WARC-Target-URI: ex ample.com -WARC-Target-URI: h<>ttp://example.com/ -WARC-Type: does-not-exist -WARC-Type: CAPITALIZED -WARC-Concurrent-To: http://example.com/ -WARC-Record-ID: -WARC-Date: 2017-03-06T04:03:53Z -WARC-Date: 2017-03-06T04:03:53.Z -Content-Type: asdf -Content-Type: has space/asdf -Content-Type: asdf/has space -Content-Type: asdf/has space;asdf -WARC-Block-Digest: asdf -WARC-Block-Digest: has space:asdf -WARC-Block-Digest: sha1:&$*^&*^#*&^ -WARC-IP-Address: 1.2.3.4.5 -WARC-Truncated: invalid -WARC-Warcinfo-ID: asdf:asdf -WARC-Filename: not-yet-tested -WARC-Profile: asdf -WARC-Profile: http://netpreserve.org/warc/1.0/revisit/identical-payload-digest -WARC-Identified-Payload-Type: asdf -WARC-Segment-Origin-ID: http://example.com -WARC-Segment-Number: not-an-integer -WARC-Segment-Number: 0 -WARC-Segment-Number: 1 -WARC-Segment-Number: 2 -WARC-Segment-Total-Length: 0 -WARC-Segment-Total-Length: not-an-integer -WARC-Refers-To-Target-URI: http://example.com -WARC-Refers-To-Date: not-a-date -WARC-Unknown-Field: asdf -Content-Length: 0 - - -WARC/1.1 -WARC-Date: 2017-03-06T04:03:53Z -WARC-Date: 2017-03-06T04:03:53.Z -WARC-Date: 2017-03-06T04:03:53.0Z -WARC-Type: invalid -Content-Length: 0 - - -WARC/1.1 -WARC-Type: request -WARC-Segment-Number: 1 -Content-Length: 0 - - -WARC/invalid +WARC/1.0 +WARC-Target-URI: +WARC-Target-URI: example.com +WARC-Target-URI: ex ample.com +WARC-Target-URI: h<>ttp://example.com/ +WARC-Type: does-not-exist +WARC-Type: CAPITALIZED +WARC-Concurrent-To: http://example.com/ +WARC-Record-ID: +WARC-Date: 2017-03-06T04:03:53Z +WARC-Date: 2017-03-06T04:03:53.Z +Content-Type: asdf +Content-Type: has space/asdf +Content-Type: asdf/has space +Content-Type: asdf/has space;asdf +WARC-Block-Digest: asdf +WARC-Block-Digest: has space:asdf +WARC-Block-Digest: 
sha1:&$*^&*^#*&^ +WARC-IP-Address: 1.2.3.4.5 +WARC-Truncated: invalid +WARC-Warcinfo-ID: asdf:asdf +WARC-Filename: not-yet-tested +WARC-Profile: asdf +WARC-Profile: http://netpreserve.org/warc/1.0/revisit/identical-payload-digest +WARC-Identified-Payload-Type: asdf +WARC-Segment-Origin-ID: http://example.com +WARC-Segment-Number: not-an-integer +WARC-Segment-Number: 0 +WARC-Segment-Number: 1 +WARC-Segment-Number: 2 +WARC-Segment-Total-Length: 0 +WARC-Segment-Total-Length: not-an-integer +WARC-Refers-To-Target-URI: http://example.com +WARC-Refers-To-Date: not-a-date +WARC-Unknown-Field: asdf +Content-Length: 0 + + +WARC/1.1 +WARC-Date: 2017-03-06T04:03:53Z +WARC-Date: 2017-03-06T04:03:53.Z +WARC-Date: 2017-03-06T04:03:53.0Z +WARC-Type: invalid +Content-Length: 0 + + +WARC/1.1 +WARC-Type: request +WARC-Segment-Number: 1 +Content-Length: 0 + + +WARC/invalid diff --git a/test/data/standard-torture-validate-record.warc b/test/data/standard-torture-validate-record.warc index fa03b38e..da6a2aaf 100644 --- a/test/data/standard-torture-validate-record.warc +++ b/test/data/standard-torture-validate-record.warc @@ -15,7 +15,7 @@ token cannot have a space: WARC/1.0 -WARC-Record-ID: test-empty-warc-fields +WARC-Record-ID: WARC-Type: warcinfo Content-Type: application/warc-fields Content-Length: 0 @@ -23,7 +23,7 @@ Content-Length: 0 WARC/1.0 WARC-Type: warcinfo -WARC-Record-ID: test-warcinfo-non-recommended-content-type +WARC-Record-ID: Content-Type: not-application/warc-fields Content-Length: 5 @@ -32,7 +32,7 @@ foo WARC/1.0 WARC-Type: response -WARC-Record-ID: test-response-content-type +WARC-Record-ID: WARC-Target-URI: HtTp://example.com/ Content-Type: text/plain Content-Length: 0 @@ -40,7 +40,7 @@ Content-Length: 0 WARC/1.0 WARC-Type: resource -WARC-Record-ID: test-resource-dns-content-type +WARC-Record-ID: WARC-Target-URI: DnS:asdfasdf Content-Type: text/plain Content-Length: 0 @@ -48,7 +48,7 @@ Content-Length: 0 WARC/1.0 WARC-Type: resource -WARC-Record-ID: 
test-resource-dns-empty +WARC-Record-ID: WARC-Test-TODO: add another with valid block WARC-Target-URI: DnS:asdfasdf Content-Type: text/dns @@ -57,14 +57,14 @@ Content-Length: 0 WARC/1.0 WARC-Type: resource -WARC-Record-ID: test-resource-not-dns +WARC-Record-ID: WARC-Target-URI: foo:bar Content-Length: 0 WARC/1.0 WARC-Type: request -WARC-Record-ID: test-request-unrecommended-content-type +WARC-Record-ID: WARC-Target-URI: hTtP://example.com/ Content-Type: text/plain Content-Length: 0 @@ -72,7 +72,7 @@ Content-Length: 0 WARC/1.0 WARC-Type: request -WARC-Record-ID: test-request-unrecommended-content-type-with-ip +WARC-Record-ID: WARC-Target-URI: hTtP://example.com/ WARC-IP-Address: 1.2.3.4 Content-Type: text/plain @@ -81,55 +81,55 @@ Content-Length: 0 WARC/1.0 WARC-Type: metadata -WARC-Record-ID: test-metadata-warc-fields-empty +WARC-Record-ID: Content-Type: application/warc-fields Content-Length: 0 WARC/1.0 WARC-Type: metadata -WARC-Record-ID: test-metadata-not-warc-fields +WARC-Record-ID: Content-Type: not-application/warc-fields Content-Length: 0 WARC/1.0 WARC-Type: revisit -WARC-Record-ID: test-revisit-profile-unknown +WARC-Record-ID: WARC-Profile: none Content-Length: 0 WARC/1.0 WARC-Type: revisit -WARC-Record-ID: test-revisit-profile-future +WARC-Record-ID: WARC-Profile: http://netpreserve.org/warc/1.1/revisit/identical-payload-digest Content-Length: 0 WARC/1.0 WARC-Type: revisit -WARC-Record-ID: test-revisit-profile-good +WARC-Record-ID: WARC-Profile: http://netpreserve.org/warc/1.0/revisit/server-not-modified Content-Length: 0 WARC/1.0 WARC-Type: conversion -WARC-Record-ID: test-conversion +WARC-Record-ID: Content-Length: 0 WARC/1.0 WARC-Type: continuation -WARC-Record-ID: test-continuation-segment-1 +WARC-Record-ID: WARC-Segment-Number: 1 Content-Length: 0 WARC/1.0 WARC-Type: continuation -WARC-Record-ID: test-continuation-segment-valid +WARC-Record-ID: WARC-Segment-Number: 2 Content-Length: 0 diff --git a/test/test_tests.py b/test/test_tests.py index 
91eba656..c08a19f6 100644 --- a/test/test_tests.py +++ b/test/test_tests.py @@ -51,80 +51,68 @@ def test_torture_validate_record(): comment: warc-fields lines must end with \\r\\n: test: lines should end with \\r\\n comment: Missing colon in warc-fields line: no colon comment: Invalid warc-fields name: token cannot have a space - WARC-Record-ID test-empty-warc-fields + WARC-Record-ID WARC-Type warcinfo digest not present - error: uri must be within <>: WARC-Record-ID test-empty-warc-fields error: missing required header: WARC-Date comment: warc-fields body present but empty - WARC-Record-ID test-warcinfo-non-recommended-content-type + WARC-Record-ID WARC-Type warcinfo digest not present - error: uri must be within <>: WARC-Record-ID test-warcinfo-non-recommended-content-type error: missing required header: WARC-Date - recommendation: warcinfo Content-Type recommended to be application/warc-fields, saw: not-application/warc-fields - WARC-Record-ID test-response-content-type + recommendation: warcinfo Content-Type recommended to be application/warc-fields: not-application/warc-fields + WARC-Record-ID WARC-Type response digest not present - error: uri must be within <>: WARC-Record-ID test-response-content-type error: missing required header: WARC-Date - error: responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw: text/plain + error: responses for http/https should have Content-Type of application/http; msgtype=response or application/http: text/plain error: WARC-IP-Address should be used for http and https responses - WARC-Record-ID test-resource-dns-content-type + WARC-Record-ID WARC-Type resource digest not present - error: uri must be within <>: WARC-Record-ID test-resource-dns-content-type error: missing required header: WARC-Date - error: recource records for dns: shall have Content-Type of text/dns, saw: text/plain - WARC-Record-ID test-resource-dns-empty + error: resource records for dns shall have 
Content-Type of text/dns: text/plain + WARC-Record-ID WARC-Type resource digest not present - error: uri must be within <>: WARC-Record-ID test-resource-dns-empty error: missing required header: WARC-Date comment: unknown field, no validation performed: WARC-Test-TODO add another with valid block - WARC-Record-ID test-resource-not-dns + WARC-Record-ID WARC-Type resource digest not present - error: uri must be within <>: WARC-Record-ID test-resource-not-dns error: missing required header: Content-Type error: missing required header: WARC-Date - WARC-Record-ID test-request-unrecommended-content-type + WARC-Record-ID WARC-Type request digest not present - error: uri must be within <>: WARC-Record-ID test-request-unrecommended-content-type error: missing required header: WARC-Date - error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw: text/plain + error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http: text/plain error: WARC-IP-Address should be used for http and https requests - WARC-Record-ID test-request-unrecommended-content-type-with-ip + WARC-Record-ID WARC-Type request digest not present - error: uri must be within <>: WARC-Record-ID test-request-unrecommended-content-type-with-ip error: missing required header: WARC-Date - error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw: text/plain - WARC-Record-ID test-metadata-warc-fields-empty + error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http: text/plain + WARC-Record-ID WARC-Type metadata digest not present - error: uri must be within <>: WARC-Record-ID test-metadata-warc-fields-empty error: missing required header: WARC-Date comment: warc-fields body present but empty - WARC-Record-ID test-metadata-not-warc-fields + WARC-Record-ID WARC-Type metadata digest not present 
- error: uri must be within <>: WARC-Record-ID test-metadata-not-warc-fields error: missing required header: WARC-Date - WARC-Record-ID test-revisit-profile-unknown + WARC-Record-ID WARC-Type revisit digest not present - error: uri must be within <>: WARC-Record-ID test-revisit-profile-unknown error: missing required header: Content-Type error: missing required header: WARC-Date error: missing required header: WARC-Target-URI comment: extension seen: WARC-Profile none comment: no revisit details validation done due to unknown profile: none - WARC-Record-ID test-revisit-profile-future + WARC-Record-ID WARC-Type revisit digest not present - error: uri must be within <>: WARC-Record-ID test-revisit-profile-future error: missing required header: Content-Type error: missing required header: WARC-Date error: missing required header: WARC-Target-URI @@ -133,34 +121,30 @@ def test_torture_validate_record(): recommendation: missing recommended header: WARC-Refers-To-Date recommendation: missing recommended header: WARC-Refers-To-Target-URI comment: extension seen: WARC-Profile http://netpreserve.org/warc/1.1/revisit/identical-payload-digest - WARC-Record-ID test-revisit-profile-good + WARC-Record-ID WARC-Type revisit digest not present - error: uri must be within <>: WARC-Record-ID test-revisit-profile-good error: missing required header: Content-Type error: missing required header: WARC-Date error: missing required header: WARC-Target-URI recommendation: missing recommended header: WARC-Refers-To recommendation: missing recommended header: WARC-Refers-To-Date - WARC-Record-ID test-conversion + WARC-Record-ID WARC-Type conversion digest not present - error: uri must be within <>: WARC-Record-ID test-conversion error: missing required header: WARC-Date error: missing required header: WARC-Target-URI - WARC-Record-ID test-continuation-segment-1 + WARC-Record-ID WARC-Type continuation digest not present - error: uri must be within <>: WARC-Record-ID test-continuation-segment-1 
error: missing required header: WARC-Date error: missing required header: WARC-Segment-Origin-ID error: missing required header: WARC-Target-URI - error: continuation record must have WARC-Segment-Number > 1, saw: 1 + error: continuation record must have WARC-Segment-Number > 1: 1 comment: warcio test continuation code has not been tested, expect bugs - WARC-Record-ID test-continuation-segment-valid + WARC-Record-ID WARC-Type continuation digest not present - error: uri must be within <>: WARC-Record-ID test-continuation-segment-valid error: missing required header: WARC-Date error: missing required header: WARC-Segment-Origin-ID error: missing required header: WARC-Target-URI @@ -184,7 +168,7 @@ def test_torture_validate_field(): expected = """\ test/data/standard-torture-validate-field.warc - WARC-Record-ID + WARC-Record-ID WARC-Type does-not-exist unknown hash algorithm name in block digest error: uri must not be within <>: WARC-Target-URI diff --git a/warcio/tester.py b/warcio/tester.py index 4ee05f1f..023cdb29 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -126,7 +126,7 @@ def validate_warc_fields(record, commentary): def validate_warcinfo(record, commentary, pending): content_type = record.rec_headers.get_header('Content-Type', 'none') if content_type.lower() != 'application/warc-fields': - commentary.recommendation('warcinfo Content-Type recommended to be application/warc-fields, saw:', content_type) + commentary.recommendation('warcinfo Content-Type recommended to be application/warc-fields:', content_type) else: # format: warc-fields # allowable fields include but not limited to DMCI plus the following @@ -147,7 +147,7 @@ def validate_response(record, commentary, pending): if target_uri.startswith('http:') or target_uri.startswith('https:'): content_type = record.rec_headers.get_header('Content-Type', 'none') if canon_content_type(content_type) not in {'application/http; msgtype=response', 'application/http'}: - commentary.error('responses for 
http/https should have Content-Type of application/http; msgtype=response or application/http, saw:', content_type) + commentary.error('responses for http/https should have Content-Type of application/http; msgtype=response or application/http:', content_type) if record.rec_headers.get_header('WARC-IP-Address') is None: commentary.error('WARC-IP-Address should be used for http and https responses') @@ -164,7 +164,7 @@ def validate_resource(record, commentary, pending): if target_uri.startswith('dns:'): content_type = record.rec_headers.get_header('Content-Type', 'none') if content_type.lower() != 'text/dns': - commentary.error('recource records for dns: shall have Content-Type of text/dns, saw:', content_type) + commentary.error('resource records for dns shall have Content-Type of text/dns:', content_type) else: # rfc 2540 and rfc 1035 #validate_text_dns() @@ -180,7 +180,7 @@ def validate_request(record, commentary, pending): content_type = record.rec_headers.get_header('Content-Type') if canon_content_type(content_type) not in {'application/http; msgtype=request', 'application/http'}: - commentary.error('requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw:', content_type) + commentary.error('requests for http/https should have Content-Type of application/http; msgtype=request or application/http:', content_type) if record.rec_headers.get_header('WARC-IP-Address') is None: commentary.error('WARC-IP-Address should be used for http and https requests') @@ -240,12 +240,12 @@ def validate_continuation(record, commentary, pending): segment_number = record.rec_headers.get_header('WARC-Segment-Number', 'none') if segment_number.isdigit() and int(segment_number) < 2: - commentary.error('continuation record must have WARC-Segment-Number > 1, saw:', segment_number) + commentary.error('continuation record must have WARC-Segment-Number > 1:', segment_number) # last segment: required WARC-Segment-Total-Length, optional 
WARC-Truncated -def validate_actual_uri(field, value, record, version, commentary, pending): +def validate_unbracketed_uri(field, value, record, version, commentary, pending): # uri per RFC 3986 # should use a registered scheme # %XX encoding, normalize to upper case @@ -272,16 +272,16 @@ def validate_warc_type(field, value, record, version, commentary, pending): commentary.comment('unknown WARC-Type:', field, value) -def validate_uri(field, value, record, version, commentary, pending): +def validate_bracketed_uri(field, value, record, version, commentary, pending): # < uri > if not (value.startswith('<') and value.endswith('>')): commentary.error('uri must be within <>:', field, value) return - validate_actual_uri(field, value[1:-1], record, version, commentary, pending) + validate_unbracketed_uri(field, value[1:-1], record, version, commentary, pending) def validate_record_id(field, value, record, version, commentary, pending): - validate_uri(field, value, record, version, commentary, pending) + validate_bracketed_uri(field, value, record, version, commentary, pending) # TODO: should be "globally unique for its period of intended use" @@ -379,7 +379,7 @@ def validate_truncated(field, value, record, version, commentary, pending): def validate_warcinfo_id(field, value, record, version, commentary, pending): - validate_uri(field, value, record, version, commentary, pending) + validate_bracketed_uri(field, value, record, version, commentary, pending) # TODO: should point at a warcinfo record @@ -446,7 +446,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen 'validate': validate_content_type, }, 'WARC-Concurrent-To': { - 'validate': validate_uri, + 'validate': validate_bracketed_uri, }, 'WARC-Block-Digest': { 'validate': validate_digest, @@ -458,10 +458,10 @@ def validate_segment_total_length(field, value, record, version, commentary, pen 'validate': validate_ip, }, 'WARC-Refers-To': { - 'validate': validate_uri, + 'validate': 
validate_bracketed_uri, }, 'WARC-Target-URI': { - 'validate': validate_actual_uri, + 'validate': validate_unbracketed_uri, }, 'WARC-Truncated': { 'validate': validate_truncated, @@ -479,7 +479,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen 'validate': validate_content_type, }, 'WARC-Segment-Origin-ID': { - 'validate': validate_uri, + 'validate': validate_bracketed_uri, }, 'WARC-Segment-Number': { 'validate': validate_segment_number, @@ -488,7 +488,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen 'validate': validate_segment_total_length, }, 'WARC-Refers-To-Target-URI': { - 'validate': validate_actual_uri, + 'validate': validate_unbracketed_uri, 'minver': '1.1', }, 'WARC-Refers-To-Date': { From 2a10b23aafa5f7023dea11b7a41b8f6f0525331e Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Mon, 28 Jan 2019 13:31:40 -0800 Subject: [PATCH 19/68] warc-segment-number cleaner recommendation --- test/test_tests.py | 1 - warcio/tester.py | 12 ++++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/test/test_tests.py b/test/test_tests.py index c08a19f6..dcbc3666 100644 --- a/test/test_tests.py +++ b/test/test_tests.py @@ -237,7 +237,6 @@ def test_torture_validate_field(): error: missing required header: WARC-Record-ID error: missing required header: WARC-Target-URI recommendation: do not segment WARC-Type request - comment: Unknown field for this record type, perhaps an extension: request WARC-Segment-Number """ value = helper(args, 0) diff --git a/warcio/tester.py b/warcio/tester.py index 023cdb29..6346754d 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -503,20 +503,21 @@ def validate_segment_total_length(field, value, record, version, commentary, pen 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'Content-Type'], 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Filename', 'WARC-Truncated'], 'prohibited': ['WARC-Refers-To', 
'WARC-Profile', 'WARC-Identified-Payload-Type'], + 'ignored': ['WARC-Segment-Number'], 'validate': validate_warcinfo, }, 'response': { 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'Content-Type', 'WARC-Target-URI'], 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated', - 'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address'], + 'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Segment-Number'], 'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], 'validate': validate_response, }, 'resource': { 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'WARC-Target-URI', 'Content-Type'], 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated', - 'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-Identified-Payload-Type'], + 'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-Identified-Payload-Type', 'WARC-Segment-Number'], 'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], 'validate': validate_resource, }, @@ -526,6 +527,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated', 'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address'], 'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], + 'ignored': ['WARC-Segment-Number'], 'validate': validate_request, }, 'metadata': { @@ -534,6 +536,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen 'optional': ['WARC-Block-Digest', 'WARC-IP-Address', 'WARC-Truncated', 'WARC-Concurrent-To', 'WARC-Refers-To', 'WARC-Target-URI', 'WARC-Warcinfo-ID'], 'prohibited': ['WARC-Payload-Digest', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 
'WARC-Filename', 'WARC-Profile'], + 'ignored': ['WARC-Segment-Number'], 'validate': validate_metadata, }, 'revisit': { @@ -542,11 +545,12 @@ def validate_segment_total_length(field, value, record, version, commentary, pen 'optional': ['WARC-Block-Digest', 'WARC-Truncated', 'WARC-IP-Address', 'WARC-Warcinfo-ID', # normal optionals 'WARC-Payload-Digest', 'WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date'], # these are for profiles 'prohibited': ['WARC-Filename'], + 'ignored': ['WARC-Segment-Number'], 'validate': validate_revisit, }, 'conversion': { 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'WARC-Target-URI'], - 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Truncated', 'WARC-Refers-To', 'WARC-Warcinfo-ID'], + 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Truncated', 'WARC-Refers-To', 'WARC-Warcinfo-ID', 'WARC-Segment-Number'], 'prohibited': ['WARC-Concurrent-To', 'WARC-IP-Address', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], 'validate': validate_conversion, }, @@ -574,7 +578,7 @@ def validate_fields_against_rec_type(rec_type, config, rec_headers, commentary, for rec in sorted(config.get('recommended', [])): if not rec_headers.get_header(rec): commentary.recommendation('missing recommended header:', rec) - allowed = make_header_set(config, ('required', 'optional', 'recommended')) + allowed = make_header_set(config, ('required', 'optional', 'recommended', 'ignored')) prohibited = make_header_set(config, ('prohibited',)) for field, value in rec_headers.headers: From 81c9f0a4e96b82dbb8181531239fa00231b33887 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Mon, 28 Jan 2019 13:55:54 -0800 Subject: [PATCH 20/68] segment origin id --- test/test_tests.py | 1 + warcio/tester.py | 23 +++++++++++++---------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/test/test_tests.py b/test/test_tests.py index dcbc3666..598ba49b 100644 --- 
a/test/test_tests.py +++ b/test/test_tests.py @@ -232,6 +232,7 @@ def test_torture_validate_field(): WARC-Record-ID None WARC-Type request digest not present + error: segmented records must have both WARC-Segment-Number and WARC-Segment-Origin-ID error: missing required header: Content-Type error: missing required header: WARC-Date error: missing required header: WARC-Record-ID diff --git a/warcio/tester.py b/warcio/tester.py index 6346754d..632de060 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -420,6 +420,9 @@ def validate_segment_number(field, value, record, version, commentary, pending): if rec_type != 'continuation': if iv != 1: commentary.error('non-continuation records must always have WARC-Segment-Number: 1:', field, value) + origin_id = record.rec_headers.get_header('WARC-Segment-Origin-ID') + if origin_id is None: + commentary.error('segmented records must have both WARC-Segment-Number and WARC-Segment-Origin-ID') if rec_type in {'warcinfo', 'request', 'metadata', 'revisit'}: commentary.recommendation('do not segment WARC-Type', rec_type) @@ -503,21 +506,21 @@ def validate_segment_total_length(field, value, record, version, commentary, pen 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'Content-Type'], 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Filename', 'WARC-Truncated'], 'prohibited': ['WARC-Refers-To', 'WARC-Profile', 'WARC-Identified-Payload-Type'], - 'ignored': ['WARC-Segment-Number'], + 'ignored': ['WARC-Segment-Number', 'WARC-Segment-Origin-ID'], 'validate': validate_warcinfo, }, 'response': { 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'Content-Type', 'WARC-Target-URI'], 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated', - 'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Segment-Number'], + 'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Segment-Number', 'WARC-Segment-Origin-ID'], 
'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], 'validate': validate_response, }, 'resource': { 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'WARC-Target-URI', 'Content-Type'], 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated', - 'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-Identified-Payload-Type', 'WARC-Segment-Number'], + 'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-Identified-Payload-Type', 'WARC-Segment-Number', 'WARC-Segment-Origin-ID'], 'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], 'validate': validate_resource, }, @@ -527,7 +530,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated', 'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address'], 'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], - 'ignored': ['WARC-Segment-Number'], + 'ignored': ['WARC-Segment-Number', 'WARC-Segment-Origin-ID'], 'validate': validate_request, }, 'metadata': { @@ -536,7 +539,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen 'optional': ['WARC-Block-Digest', 'WARC-IP-Address', 'WARC-Truncated', 'WARC-Concurrent-To', 'WARC-Refers-To', 'WARC-Target-URI', 'WARC-Warcinfo-ID'], 'prohibited': ['WARC-Payload-Digest', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], - 'ignored': ['WARC-Segment-Number'], + 'ignored': ['WARC-Segment-Number', 'WARC-Segment-Origin-ID'], 'validate': validate_metadata, }, 'revisit': { @@ -545,18 +548,18 @@ def validate_segment_total_length(field, value, record, version, commentary, pen 'optional': ['WARC-Block-Digest', 'WARC-Truncated', 'WARC-IP-Address', 'WARC-Warcinfo-ID', # 
normal optionals 'WARC-Payload-Digest', 'WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date'], # these are for profiles 'prohibited': ['WARC-Filename'], - 'ignored': ['WARC-Segment-Number'], + 'ignored': ['WARC-Segment-Number', 'WARC-Segment-Origin-ID'], 'validate': validate_revisit, }, 'conversion': { 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'WARC-Target-URI'], - 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Truncated', 'WARC-Refers-To', 'WARC-Warcinfo-ID', 'WARC-Segment-Number'], + 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Truncated', 'WARC-Refers-To', 'WARC-Warcinfo-ID', 'WARC-Segment-Number', 'WARC-Segment-Origin-ID'], 'prohibited': ['WARC-Concurrent-To', 'WARC-IP-Address', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], 'validate': validate_conversion, }, 'continuation': { 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', - 'WARC-Segment-Origin-ID', 'WARC-Segment-Number', 'WARC-Target-URI'], + 'WARC-Segment-Number', 'WARC-Segment-Origin-ID', 'WARC-Target-URI'], 'optional': ['WARC-Segment-Total-Length', 'WARC-Truncated'], 'prohibited': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], 'validate': validate_continuation, @@ -587,8 +590,8 @@ def validate_fields_against_rec_type(rec_type, config, rec_headers, commentary, commentary.error('field not allowed in record type:', rec_type, field) elif allow_all or fl in allowed: pass - elif fl in warc_fields: - commentary.comment('Unknown field for this record type, perhaps an extension:', rec_type, field) + elif fl in warc_fields: # pragma: no cover (this is a configuration error, if it happens) + commentary.comment('Known field, but not expected for this record type:', rec_type, field) else: # an 'unknown field' comment has already 
been issued in validate_record pass From c78343a166264f7df0889865d261a768c46fe5ad Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Mon, 28 Jan 2019 14:51:19 -0800 Subject: [PATCH 21/68] timestamp checking --- test/test_tests.py | 6 ++++-- warcio/tester.py | 21 ++++++++------------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/test/test_tests.py b/test/test_tests.py index 598ba49b..89851eca 100644 --- a/test/test_tests.py +++ b/test/test_tests.py @@ -184,7 +184,8 @@ def test_torture_validate_field(): error: duplicate field seen: WARC-Type CAPITALIZED error: uri must be within <>: WARC-Concurrent-To http://example.com/ error: duplicate field seen: WARC-Date 2017-03-06T04:03:53.Z - error: WARC 1.0 time may not have fractional seconds: WARC-Date 2017-03-06T04:03:53.Z + error: Invalid timestamp: WARC-Date 2017-03-06T04:03:53.Z + error: WARC versions <= 1.0 may not have timestamps with fractional seconds: WARC-Date 2017-03-06T04:03:53.Z error: must contain a /: Content-Type asdf error: invalid subtype: Content-Type asdf error: duplicate field seen: Content-Type has space/asdf @@ -212,6 +213,7 @@ def test_torture_validate_field(): error: non-continuation records must always have WARC-Segment-Number: 1: WARC-Segment-Number 2 error: duplicate field seen: WARC-Segment-Total-Length not-an-integer error: must be an integer: WARC-Segment-Total-Length not-an-integer + error: Invalid timestamp: WARC-Refers-To-Date not-a-date comment: unknown WARC-Type: WARC-Type does-not-exist comment: WARC-Type is not lower-case: WARC-Type CAPITALIZED comment: unknown WARC-Type: WARC-Type CAPITALIZED @@ -226,7 +228,7 @@ def test_torture_validate_field(): WARC-Type invalid digest not present error: duplicate field seen: WARC-Date 2017-03-06T04:03:53.Z - error: fractional seconds must have 1-9 digits: WARC-Date 2017-03-06T04:03:53.Z + error: Invalid timestamp: WARC-Date 2017-03-06T04:03:53.Z error: duplicate field seen: WARC-Date 2017-03-06T04:03:53.0Z comment: unknown 
WARC-Type: WARC-Type invalid WARC-Record-ID None diff --git a/warcio/tester.py b/warcio/tester.py index 632de060..5396ff3b 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -286,21 +286,16 @@ def validate_record_id(field, value, record, version, commentary, pending): def validate_timestamp(field, value, record, version, commentary, pending): - use_ms = False if version == '1.0' else True - if not use_ms: - if '.' in value: - # XXX specification infelicity: would be nice to have 'advice to implementers' here - commentary.error('WARC 1.0 time may not have fractional seconds:', field, value) - else: - if '.' in value: - start, end = value.split('.', 1) - if not re.search(r'\A[0-9]{1,9}Z\Z', end): - commentary.error('fractional seconds must have 1-9 digits:', field, value) + ISO_RE = r'\A\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:.\d{1,9})?Z\Z' - # XXX the above is pretty incomplete for dash, colon, trailing Z, etc + if not re.match(ISO_RE, value): + commentary.error('Invalid timestamp:', field, value) - # TODO: "multiple records written as part of a single capture event shall use the same WARC-Date" - # how? follow WARC-Concurrent-To pointer(s) from request to response(s) + use_ms = False if version <= '1.0' else True + if not use_ms: + if '.' 
in value: + # specification infelicity: would be nice to have 'advice to implementers' here + commentary.error('WARC versions <= 1.0 may not have timestamps with fractional seconds:', field, value) def validate_content_length(field, value, record, version, commentary, pending): From efe0fdab9178f440b4c6682482ca6110e73d020e Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Mon, 28 Jan 2019 16:46:59 -0800 Subject: [PATCH 22/68] buglet --- test/data/standard-torture-validate-field.warc | 1 + warcio/tester.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/test/data/standard-torture-validate-field.warc b/test/data/standard-torture-validate-field.warc index 816413be..126ba964 100644 --- a/test/data/standard-torture-validate-field.warc +++ b/test/data/standard-torture-validate-field.warc @@ -6,6 +6,7 @@ WARC-Target-URI: h<>ttp://example.com/ WARC-Type: does-not-exist WARC-Type: CAPITALIZED WARC-Concurrent-To: http://example.com/ +WARC-Concurrent-To: WARC-Record-ID: WARC-Date: 2017-03-06T04:03:53Z WARC-Date: 2017-03-06T04:03:53.Z diff --git a/warcio/tester.py b/warcio/tester.py index 5396ff3b..8e9d8da3 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -608,7 +608,7 @@ def validate_record(record): seen_fields = set() for field, value in record.rec_headers.headers: field_l = field.lower() - if field != 'warc-concurrent-to' and field_l in seen_fields: + if field_l != 'warc-concurrent-to' and field_l in seen_fields: commentary.error('duplicate field seen:', field, value) seen_fields.add(field_l) if field_l not in warc_fields: From 7a2664405e5e9246147bb23ecf6e6bdc813e0db3 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Tue, 29 Jan 2019 17:52:05 -0800 Subject: [PATCH 23/68] global checks --- .../data/standard-torture-validate-field.warc | 2 + test/test_tests.py | 51 +++- warcio/tester.py | 278 +++++++++++++++--- 3 files changed, 276 insertions(+), 55 deletions(-) diff --git a/test/data/standard-torture-validate-field.warc 
b/test/data/standard-torture-validate-field.warc index 126ba964..a928a4c4 100644 --- a/test/data/standard-torture-validate-field.warc +++ b/test/data/standard-torture-validate-field.warc @@ -33,6 +33,8 @@ WARC-Segment-Total-Length: 0 WARC-Segment-Total-Length: not-an-integer WARC-Refers-To-Target-URI: http://example.com WARC-Refers-To-Date: not-a-date +WARC-Refers-To-Filename: asdf +WARC-Refers-To-File-Offset: 1234 WARC-Unknown-Field: asdf Content-Length: 0 diff --git a/test/test_tests.py b/test/test_tests.py index 89851eca..ebbdb509 100644 --- a/test/test_tests.py +++ b/test/test_tests.py @@ -6,6 +6,14 @@ from .test_cli import patch_stdout +file_map = {} + + +def map_test_file(filename): + file_map[filename] = get_test_file(filename) + return file_map[filename] + + def helper(args, expected_exit_value): with patch_stdout() as buff: exit_value = None @@ -22,17 +30,16 @@ def helper(args, expected_exit_value): def remove_before_test_data(s): ret = '' for line in s.splitlines(True): - if '/test/data/' in line: - line = 'test/data/' + line.split('/test/data/', 1)[1] - if '\\test\\data\\' in line: - line = 'test/data/' + line.split('\\test\\data\\', 1)[1] + for filename, value in file_map.items(): + if value in line: + line = line.replace(value, 'test/data/' + filename) ret += line return ret def test_torture_validate_record(): files = ['standard-torture-validate-record.warc'] - files = [get_test_file(filename) for filename in files] + files = [map_test_file(filename) for filename in files] args = ['test'] args.extend(files) @@ -55,7 +62,7 @@ def test_torture_validate_record(): WARC-Type warcinfo digest not present error: missing required header: WARC-Date - comment: warc-fields body present but empty + comment: warc-fields block present but empty WARC-Record-ID WARC-Type warcinfo digest not present @@ -67,6 +74,7 @@ def test_torture_validate_record(): error: missing required header: WARC-Date error: responses for http/https should have Content-Type of application/http; 
msgtype=response or application/http: text/plain error: WARC-IP-Address should be used for http and https responses + error: http/https responses should have http headers WARC-Record-ID WARC-Type resource digest not present @@ -97,7 +105,7 @@ def test_torture_validate_record(): WARC-Type metadata digest not present error: missing required header: WARC-Date - comment: warc-fields body present but empty + comment: warc-fields block present but empty WARC-Record-ID WARC-Type metadata digest not present @@ -108,7 +116,7 @@ def test_torture_validate_record(): error: missing required header: Content-Type error: missing required header: WARC-Date error: missing required header: WARC-Target-URI - comment: extension seen: WARC-Profile none + comment: unknown value, perhaps an extension: WARC-Profile none comment: no revisit details validation done due to unknown profile: none WARC-Record-ID WARC-Type revisit @@ -120,7 +128,7 @@ def test_torture_validate_record(): recommendation: missing recommended header: WARC-Refers-To recommendation: missing recommended header: WARC-Refers-To-Date recommendation: missing recommended header: WARC-Refers-To-Target-URI - comment: extension seen: WARC-Profile http://netpreserve.org/warc/1.1/revisit/identical-payload-digest + comment: WARC-Profile value is for a different version: 1.0 http://netpreserve.org/warc/1.1/revisit/identical-payload-digest WARC-Record-ID WARC-Type revisit digest not present @@ -161,7 +169,7 @@ def test_torture_validate_record(): def test_torture_validate_field(): files = ['standard-torture-validate-field.warc'] - files = [get_test_file(filename) for filename in files] + files = [map_test_file(filename) for filename in files] args = ['test'] args.extend(files) @@ -219,10 +227,12 @@ def test_torture_validate_field(): comment: unknown WARC-Type: WARC-Type CAPITALIZED comment: unknown digest algorithm: WARC-Block-Digest asdf comment: Invalid-looking digest value: WARC-Block-Digest sha1:&$*^&*^#*&^ - comment: extension 
seen: WARC-Truncated invalid - comment: extension seen: WARC-Profile asdf + comment: unknown value, perhaps an extension: WARC-Truncated invalid + comment: unknown value, perhaps an extension: WARC-Profile asdf comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Date not-a-date + comment: This Heretrix extension never made it into the standard: WARC-Refers-To-Filename asdf + comment: This Heretrix extension never made it into the standard: WARC-Refers-To-File-Offset 1234 comment: unknown field, no validation performed: WARC-Unknown-Field asdf WARC-Record-ID None WARC-Type invalid @@ -240,6 +250,11 @@ def test_torture_validate_field(): error: missing required header: WARC-Record-ID error: missing required header: WARC-Target-URI recommendation: do not segment WARC-Type request +global warcinfo checks + comment: WARC-Warcinfo-ID not found: WARC-Warcinfo-ID asdf:asdf +global Concurrent-To checks + comment: WARC-Concurrent-To not found: WARC-Concurrent-To + comment: WARC-Concurrent-To not found: WARC-Concurrent-To http://example.com/ """ value = helper(args, 0) @@ -251,7 +266,7 @@ def test_torture_validate_field(): def test_arc(): files = ['does-not-exist.arc'] - files = [get_test_file(filename) for filename in files] + files = [map_test_file(filename) for filename in files] args = ['test'] args.extend(files) @@ -267,7 +282,7 @@ def test_arc(): def test_digests(): # needed for test coverage files = ['example-digest-bad.warc', 'example.warc'] - files = [get_test_file(filename) for filename in files] + files = [map_test_file(filename) for filename in files] args = ['test'] args.extend(files) @@ -282,23 +297,28 @@ def test_digests(): WARC-Type request digest pass error: WARC-IP-Address should be used for http and https requests + error: Duplicate WARC-Record-ID: found in files test/data/example-digest-bad.warc test/data/example-digest-bad.warc 
WARC-Record-ID WARC-Type request digest pass error: WARC-IP-Address should be used for http and https requests + error: Duplicate WARC-Record-ID: found in files test/data/example-digest-bad.warc test/data/example-digest-bad.warc WARC-Record-ID WARC-Type request digest pass error: WARC-IP-Address should be used for http and https requests + error: Duplicate WARC-Record-ID: found in files test/data/example-digest-bad.warc test/data/example-digest-bad.warc test/data/example.warc WARC-Record-ID WARC-Type request digest not present error: WARC-IP-Address should be used for http and https requests + error: Duplicate WARC-Record-ID: found in files test/data/example.warc test/data/example-digest-bad.warc WARC-Record-ID WARC-Type revisit digest present but not checked recommendation: missing recommended header: WARC-Refers-To + comment: This Heretrix extension never made it into the standard: WARC-Profile http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com/ comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Date 2017-03-06T04:02:06Z WARC-Record-ID @@ -318,12 +338,11 @@ def test_leftovers(): # hard to test because invalid WARC Content-Length raises in archiveiterator warcio.tester.validate_content_length('Content-Length', 'not-an-integer', None, '1.0', commentary, None) - # hard to test because warcio checks the WARC version + # hard to test because warcio raises for unknown WARC version warcio.tester.validate_profile('blah', 'blah', None, '999', commentary, None) expected = '''\ error: must be an integer: Content-Length not-an-integer -comment: no profile check because unknown warc version: blah blah ''' assert '\n'.join(commentary.comments())+'\n' == expected diff --git a/warcio/tester.py b/warcio/tester.py index 8e9d8da3..870c7d6e 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -3,14 +3,15 @@ import re import sys 
import six +from collections import defaultdict from warcio.archiveiterator import WARCIterator from warcio.utils import to_native_str, Digester from warcio.exceptions import ArchiveLoadFailed -class Commentary: - def __init__(self, record_id, rec_type): +class Commentary(object): + def __init__(self, record_id=None, rec_type=None): self._record_id = record_id self._rec_type = rec_type self.errors = [] @@ -37,6 +38,7 @@ def has_comments(self): return True def comments(self): + # XXX str() all of these, in case an int or other thing slips in? for e in self.errors: yield 'error: ' + ' '.join(e) for r in self.recommendations: @@ -55,6 +57,13 @@ def __getattr__(self, name): if self._content is None: self._content = self.obj.content_stream().read() return self._content + if name == 'stream_for_digest_check': + def _doit(): + while True: + piece = self.obj.content_stream().read(1024*1024) + if len(piece) == 0: + break + return _doit return getattr(self.__dict__['obj'], name) @@ -117,7 +126,7 @@ def validate_warc_fields(record, commentary): first_line = False if not lines: - commentary.comment('warc-fields body present but empty') + commentary.comment('warc-fields block present but empty') return # check known fields @@ -126,6 +135,7 @@ def validate_warc_fields(record, commentary): def validate_warcinfo(record, commentary, pending): content_type = record.rec_headers.get_header('Content-Type', 'none') if content_type.lower() != 'application/warc-fields': + # https://github.com/iipc/warc-specifications/issues/33 -- SHALL BE or recommended? 
commentary.recommendation('warcinfo Content-Type recommended to be application/warc-fields:', content_type) else: # format: warc-fields @@ -137,8 +147,8 @@ def validate_warcinfo(record, commentary, pending): validate_warc_fields(record, commentary) # whole-file tests: - # optional that warcinfo be first in file, still deserves a comment - # allowable for warcinfo to appear anywhere + # recommended that all files start with warcinfo + # elsewise allowable for warcinfo to appear anywhere def validate_response(record, commentary, pending): @@ -152,10 +162,32 @@ def validate_response(record, commentary, pending): if record.rec_headers.get_header('WARC-IP-Address') is None: commentary.error('WARC-IP-Address should be used for http and https responses') - # error: http and https schemes should have http response headers - # test by attempting to parse them? + if not record.http_headers: + commentary.error('http/https responses should have http headers') + return - # comment: verify http content-length, if present -- commoncrawl nutch bug + http_content_length = record.http_headers.get_header('Content-Length') + if http_content_length is None: + return + + if not http_content_length.isdigit(): + commentary.comment('http content length header is not an integer', str(http_content_length)) + return + + # We want to verify http_content_length, which is the size of the compressed payload + # Trying to catch that commoncrawl nutch bug that prefixed /r/n to the payload without changing http content-length + + # this blecherous hack is because we need the length of the (possibly compressed) raw stream + # without reading any of it (so that it can be read elsewhere to check the payload digest) + + # XXX fix me before shipping :-D + + if hasattr(record, 'raw_stream'): + if hasattr(record.raw_stream, 'stream'): + if hasattr(record.raw_stream.stream, 'limit'): + if int(http_content_length) != record.raw_stream.stream.limit: + commentary.comment('Actual http payload length is 
different from http header Content-Length:', + str(record.raw_stream.stream.limit), http_content_length) def validate_resource(record, commentary, pending): @@ -171,6 +203,7 @@ def validate_resource(record, commentary, pending): pass # should never have http headers + # heuristic of looking for an http status line? and then a blank line?! def validate_request(record, commentary, pending): @@ -193,6 +226,8 @@ def validate_request(record, commentary, pending): def validate_metadata(record, commentary, pending): content_type = record.rec_headers.get_header('Content-Type', 'none') if content_type.lower() == 'application/warc-fields': + # https://github.com/iipc/warc-specifications/issues/33 SHALL be or not? + # # dublin core plus via, hopsFromSeed, fetchTimeMs -- w1.1 section 6 # via: uri -- example in Warc 1.1 section 10.5 does not have <> around it # hopsFromSeed: string @@ -206,8 +241,11 @@ def validate_revisit(record, commentary, pending): if warc_profile.endswith('/revisit/identical-payload-digest') or warc_profile.endswith('/revisit/uri-agnostic-identical-payload-digest'): config = { 'required': ['WARC-Payload-Digest'], - 'recommended': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date'], + 'recommended': ['WARC-Refers-To'], } + if '/1.1/' in warc_profile: + config['recommended'].extend(('WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date')) + validate_fields_against_rec_type('revisit', config, record.rec_headers, commentary, allow_all=True) # may have record block; # if not, shall have Content-Length: 0, @@ -282,7 +320,6 @@ def validate_bracketed_uri(field, value, record, version, commentary, pending): def validate_record_id(field, value, record, version, commentary, pending): validate_bracketed_uri(field, value, record, version, commentary, pending) - # TODO: should be "globally unique for its period of intended use" def validate_timestamp(field, value, record, version, commentary, pending): @@ -328,8 +365,6 @@ def validate_content_type(field, 
value, record, version, commentary, pending): # at this point there can be multiple parameters, # some of which could have quoted string values with ; in them - # TODO: more checking - def validate_digest(field, value, record, version, commentary, pending): if ':' not in value: @@ -370,37 +405,45 @@ def validate_ip(field, value, record, version, commentary, pending): def validate_truncated(field, value, record, version, commentary, pending): if value.lower() not in {'length', 'time', 'disconnect', 'unspecified'}: - commentary.comment('extension seen:', field, value) + commentary.comment('unknown value, perhaps an extension:', field, value) def validate_warcinfo_id(field, value, record, version, commentary, pending): validate_bracketed_uri(field, value, record, version, commentary, pending) - # TODO: should point at a warcinfo record def validate_filename(field, value, record, version, commentary, pending): - # TODO: text or quoted-string + # text or quoted-string + # comment for dangerous utf-8 in filename? pass profiles = { - # XXX WARC/0.17 and WARC/0.18 + '0.17': ['http://netpreserve.org/warc/0.17/revisit/identical-payload-digest', + 'http://netpreserve.org/warc/0.17/revisit/server-not-modified'], + '0.18': ['http://netpreserve.org/warc/0.18/revisit/identical-payload-digest', + 'http://netpreserve.org/warc/0.18/revisit/server-not-modified'], '1.0': ['http://netpreserve.org/warc/1.0/revisit/identical-payload-digest', 'http://netpreserve.org/warc/1.0/revisit/server-not-modified', - # the following removed from iipc/webarchive-commons in may 2017; common in the wild TODO comment or not? 
- # https://github.com/iipc/webarchive-commons/commits/988bec707c27a01333becfc3bd502af4441ea1e1/src/main/java/org/archive/format/warc/WARCConstants.java 'http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest'], '1.1': ['http://netpreserve.org/warc/1.1/revisit/identical-payload-digest', 'http://netpreserve.org/warc/1.1/revisit/server-not-modified'], } +profiles_rev = dict([(filename, version) for version, filenames in profiles.items() for filename in filenames]) def validate_profile(field, value, record, version, commentary, pending): if version not in profiles: - commentary.comment('no profile check because unknown warc version:', field, value) return - if value not in profiles[version]: - commentary.comment('extension seen:', field, value) + + if value in profiles_rev: + if profiles_rev[value] != version: + commentary.comment('WARC-Profile value is for a different version:', version, value) + else: + commentary.comment('unknown value, perhaps an extension:', field, value) + + if '/revisit/uri-agnostic-identical-payload-digest' in value: + commentary.comment('This Heretrix extension never made it into the standard:', field, value) def validate_segment_number(field, value, record, version, commentary, pending): @@ -427,6 +470,14 @@ def validate_segment_total_length(field, value, record, version, commentary, pen commentary.error('must be an integer:', field, value) +def validate_refers_to_filename(field, value, record, version, commentary, pending): + commentary.comment('This Heretrix extension never made it into the standard:', field, value) + + +def validate_refers_to_file_offset(field, value, record, version, commentary, pending): + commentary.comment('This Heretrix extension never made it into the standard:', field, value) + + warc_fields = { 'WARC-Type': { 'validate': validate_warc_type, @@ -493,6 +544,12 @@ def validate_segment_total_length(field, value, record, version, commentary, pen 'validate': validate_timestamp, 'minver': '1.1', }, 
+ 'WARC-Refers-To-Filename': { + 'validate': validate_refers_to_filename, + }, + 'WARC-Refers-To-File-Offset': { + 'validate': validate_refers_to_file_offset, + }, } warc_fields = dict([(k.lower(), v) for k, v in warc_fields.items()]) @@ -579,13 +636,13 @@ def validate_fields_against_rec_type(rec_type, config, rec_headers, commentary, allowed = make_header_set(config, ('required', 'optional', 'recommended', 'ignored')) prohibited = make_header_set(config, ('prohibited',)) - for field, value in rec_headers.headers: + for field, value in rec_headers.headers: # XXX not exported fl = field.lower() if fl in prohibited: commentary.error('field not allowed in record type:', rec_type, field) elif allow_all or fl in allowed: pass - elif fl in warc_fields: # pragma: no cover (this is a configuration error, if it happens) + elif fl in warc_fields: # pragma: no cover (this is a tester.py configuration omission) commentary.comment('Known field, but not expected for this record type:', rec_type, field) else: # an 'unknown field' comment has already been issued in validate_record @@ -598,15 +655,15 @@ def validate_record_against_rec_type(config, record, commentary, pending): def validate_record(record): - version = record.rec_headers.protocol.split('/', 1)[1] # XXX not exported? 
+ version = record.rec_headers.protocol.split('/', 1)[1] # XXX not exported record_id = record.rec_headers.get_header('WARC-Record-ID') rec_type = record.rec_headers.get_header('WARC-Type') - commentary = Commentary(record_id, rec_type) + commentary = Commentary(record_id=record_id, rec_type=rec_type) pending = None seen_fields = set() - for field, value in record.rec_headers.headers: + for field, value in record.rec_headers.headers: # XXX not exported field_l = field.lower() if field_l != 'warc-concurrent-to' and field_l in seen_fields: commentary.error('duplicate field seen:', field, value) @@ -617,13 +674,13 @@ def validate_record(record): config = warc_fields[field_l] if 'minver' in config: if version < config['minver']: - # unknown fields are extensions, so this is a comment and not an error commentary.comment('field was introduced after this warc version:', version, field, value) if 'validate' in config: config['validate'](field, value, record, version, commentary, pending) if rec_type not in record_types: - pass # we print a comment for this elsewhere + # we print a comment for this elsewhere + pass else: validate_fields_against_rec_type(rec_type, record_types[rec_type], record.rec_headers, commentary) validate_record_against_rec_type(record_types[rec_type], record, commentary, pending) @@ -631,10 +688,149 @@ def validate_record(record): return commentary -def _process_one(warc): - if warc.endswith('.arc') or warc.endswith('.arc.gz'): +def save_global_info(record, warcfile, commentary, all_records, concurrent_to): + record_id = record.rec_headers.get_header('WARC-Record-ID') + if record_id is None: return - with open(warc, 'rb') as stream: + + for field, value in record.rec_headers.headers: # XXX not exported + if field.lower() == 'warc-concurrent-to': + if record_id is not None and value is not None: + concurrent_to[record_id].append(value) + concurrent_to[value].append(record_id) + + save = {'warcfile': warcfile} + + saved_fields = ( + 'WARC-Type', 
'WARC-Warcinfo-ID', 'WARC-Date' + 'WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Payload-Digest', 'WARC-Target-URI', + 'WARC-Segment-Number', 'WARC-Segment-Origin-ID', 'WARC-Segment-Total-Length', 'WARC-Truncated' + ) + saved_fields = set([x.lower() for x in saved_fields]) + + for field, value in record.rec_headers.headers: # XXX not exported + field_l = field.lower() + if field_l in saved_fields and value is not None: + save[field_l] = value + if field_l == 'warc-concurrent-to': + if 'warc-concurrent-to' not in save: + save['warc-concurrent-to'] = [] + save['warc-concurrent-to'].append(value) + + if record_id in all_records: + commentary.error('Duplicate WARC-Record-ID:', record_id, 'found in files', warcfile, all_records[record_id]['warcfile']) + else: + all_records[record_id] = save + + +def check_global(all_records, concurrent_to): + check_global_warcinfo(all_records) + check_global_concurrent_to(all_records, concurrent_to) + check_global_refers_to(all_records) + check_global_segment(all_records) + + +def _print_global(header, commentary): + if commentary.has_comments(): + print(header) + for c in commentary.comments(): + print(' ', c) + + +def check_global_warcinfo(all_records): + commentary = Commentary() + for record_id, fields in all_records.items(): + if 'warc-warcinfo-id' in fields: + wanted_id = fields['warc-warcinfo-id'] + if wanted_id not in all_records or all_records[wanted_id]['warc-type'] != 'warcinfo': + commentary.comment('WARC-Warcinfo-ID not found:', record_id, 'WARC-Warcinfo-ID', wanted_id) + + _print_global('global warcinfo checks', commentary) + + +def check_global_concurrent_to(all_records, concurrent_to): + commentary = Commentary() + for record_id, fields in all_records.items(): + if 'warc-concurrent-to' in fields: + whole_set = set(fields['warc-concurrent-to']) + del fields['warc-concurrent-to'] + while True: + current_set = list(whole_set) + for c in current_set: + if c in all_records and 'warc-concurrent-to' 
in all_records[c]: + whole_set.update(set(all_records[c]['warc-concurrent-to'])) + del all_records[c]['warc-concurrent-to'] + if len(whole_set) == len(current_set): + break + warc_date = fields.get('warc-date') + for wanted_id in sorted(whole_set): + if wanted_id not in all_records: + commentary.comment('WARC-Concurrent-To not found:', record_id, 'WARC-Concurrent-To', wanted_id) + else: + new_date = all_records[wanted_id].get('warc-date') + if warc_date != new_date: + commentary.comment('WARC-Concurrent-To set has conflicting dates:', + record_id, warc_date, wanted_id, new_date) + + _print_global('global Concurrent-To checks', commentary) + + +def _revisit_compare(record_id, fields, source_field, wanted_id, all_records, target_field, commentary): + if source_field.lower() not in fields: + return + + if target_field.lower() not in all_records[wanted_id]: + commentary.comment('revisit target lacks field:', wanted_id, target_field) + return + + source_value = fields[source_field.lower()] + target_value = all_records[wanted_id][target_field.lower()] + if source_value != target_value: + commentary.comment('revisit and revisit target disagree:', + record_id, source_field, source_value, + wanted_id, target_field, target_value) + + +def check_global_refers_to(all_records): + commentary = Commentary() + for record_id, fields in all_records.items(): + if 'warc-refers-to' not in fields: + continue + + wanted_id = fields['warc-refers-to'] + if wanted_id not in all_records: + commentary.comment('WARC-Refers-To target not found:', record_id, 'Warc-Refers-To', wanted_id) + continue + + rec_type = fields.get('warc-type') + if rec_type != 'revisit': + continue + + _revisit_compare(record_id, fields, 'WARC-Refers-To-Target-URI', + wanted_id, all_records, 'WARC-Target-URI', commentary) + _revisit_compare(record_id, fields, 'WARC-Refers-To-Date', + wanted_id, all_records, 'WARC-Date', commentary) + _revisit_compare(record_id, fields, 'WARC-Payload-Digest', + wanted_id, all_records, 
'WARC-Payload-Digest', commentary) + + _print_global('global Refers-To checks', commentary) + + +def check_global_segment(all_records): + # warc-segment-origin-id :: exists, is warc-segment-number 1 + # all segments exist, and the last one has WARC-Segment-Total-Length + # and only the last one has WARC-Truncated, if any + + # Segmentation shall not be used if a record can be stored in an existing warc file + # The origin segment shall be placed in a new warc file preceded only by a warcinfo record (if any) + + pass + + +def _process_one(warcfile, all_records, concurrent_to): + if warcfile.endswith('.arc') or warcfile.endswith('.arc.gz'): + return + with open(warcfile, 'rb') as stream: for record in WARCIterator(stream, check_digests=True, fixup_bugs=False): record = WrapRecord(record) @@ -642,10 +838,9 @@ def _process_one(warc): record.rec_headers.get_header('WARC-Block-Digest')) commentary = validate_record(record) + save_global_info(record, warcfile, commentary, all_records, concurrent_to) - record.content # make sure digests are checked - # XXX might need to read and digest the raw stream to check digests for chunked encoding? 
- # XXX chunked lacks Content-Length and presumably the digest needs to be computed on the non-chunked bytes + record.stream_for_digest_check() if commentary.has_comments() or record.digest_checker.passed is False: print(' ', 'WARC-Record-ID', commentary.record_id()) @@ -671,16 +866,21 @@ class Tester(object): def __init__(self, cmd): self.inputs = cmd.inputs self.exit_value = 0 + self.all_records = defaultdict(dict) + self.concurrent_to = defaultdict(list) def process_all(self): - for warc in self.inputs: - print(warc) + for warcfile in self.inputs: + print(warcfile) try: - self.process_one(warc) + self.process_one(warcfile) except ArchiveLoadFailed as e: print(' saw exception ArchiveLoadFailed: '+str(e).rstrip(), file=sys.stderr) print(' skipping rest of file', file=sys.stderr) + + check_global(self.all_records, self.concurrent_to) + return self.exit_value - def process_one(self, filename): - _process_one(filename) + def process_one(self, warcfile): + _process_one(warcfile, self.all_records, self.concurrent_to) From 1d6fd9d070e76e800be6821dfa5f9ec9881f3d7b Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Thu, 31 Jan 2019 12:03:44 -0800 Subject: [PATCH 24/68] check -v; capitalize most commentary --- warcio/cli.py | 1 + warcio/tester.py | 89 +++++++++++++++++++++++++----------------------- 2 files changed, 48 insertions(+), 42 deletions(-) diff --git a/warcio/cli.py b/warcio/cli.py index 7e40cdad..88f3445a 100644 --- a/warcio/cli.py +++ b/warcio/cli.py @@ -57,6 +57,7 @@ def main(args=None): test = subparsers.add_parser('test', help='WARC standards tester') test.add_argument('inputs', nargs='+') + test.add_argument('-v', '--verbose', action='store_true') test.set_defaults(func=tester) cmd = parser.parse_args(args=args) diff --git a/warcio/tester.py b/warcio/tester.py index 870c7d6e..9605ea7b 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -157,7 +157,7 @@ def validate_response(record, commentary, pending): if target_uri.startswith('http:') or 
target_uri.startswith('https:'): content_type = record.rec_headers.get_header('Content-Type', 'none') if canon_content_type(content_type) not in {'application/http; msgtype=response', 'application/http'}: - commentary.error('responses for http/https should have Content-Type of application/http; msgtype=response or application/http:', content_type) + commentary.error('Responses for http/https should have Content-Type of application/http; msgtype=response or application/http:', content_type) if record.rec_headers.get_header('WARC-IP-Address') is None: commentary.error('WARC-IP-Address should be used for http and https responses') @@ -264,7 +264,7 @@ def validate_revisit(record, commentary, pending): # if yes, should be like a response record, truncated if desired # WARC-Refers-To-Date should be the same as WARC-Date in the original record if present else: - commentary.comment('no revisit details validation done due to unknown profile:', warc_profile) + commentary.comment('No revisit details validation done due to unknown profile:', warc_profile) def validate_conversion(record, commentary, pending): @@ -291,14 +291,17 @@ def validate_unbracketed_uri(field, value, record, version, commentary, pending) if value.startswith('<') or value.endswith('>'): # wget 1.19 bug caused by WARC 1.0 spec error commentary.error('uri must not be within <>:', field, value) + value = value[1:-1] + + scheme = value.split(':', 1)[0] if ':' not in value: - commentary.error('invalid uri, no scheme:', field, value) + commentary.error('Invalid uri, no scheme:', field, value) + elif not re.search(r'\A[A-Za-z][A-Za-z0-9+\-\.]*\Z', scheme): + commentary.error('Invalid uri scheme, bad character:', field, value) + # use https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml ?? 
+ if re.search(r'\s', value): - commentary.error('invalid uri, contains whitespace:', field, value) - scheme = value.split(':', 1)[0] - if not re.search(r'\A[A-Za-z][A-Za-z0-9+\-\.]*\Z', scheme): - commentary.error('invalid uri scheme, bad character:', field, value) - # https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml + commentary.error('Invalid uri, contains whitespace:', field, value) def validate_warc_type(field, value, record, version, commentary, pending): @@ -307,7 +310,7 @@ def validate_warc_type(field, value, record, version, commentary, pending): commentary.comment('WARC-Type is not lower-case:', field, value) if value.lower() not in record_types: # standard says readers should ignore unknown warc-types - commentary.comment('unknown WARC-Type:', field, value) + commentary.comment('Unknown WARC-Type:', field, value) def validate_bracketed_uri(field, value, record, version, commentary, pending): @@ -337,7 +340,7 @@ def validate_timestamp(field, value, record, version, commentary, pending): def validate_content_length(field, value, record, version, commentary, pending): if not value.isdigit(): - commentary.error('must be an integer:', field, value) + commentary.error('Must be an integer:', field, value) token_re = r'\A[!"#$%&\'()*+\-\.0-9A-Z\^_`a-z|~]+\Z' @@ -346,7 +349,7 @@ def validate_content_length(field, value, record, version, commentary, pending): def validate_content_type(field, value, record, version, commentary, pending): if '/' not in value: - commentary.error('must contain a /:', field, value) + commentary.error('Must contain a /:', field, value) splits = value.split('/', 1) ctype = splits[0] if len(splits) > 1: @@ -354,13 +357,13 @@ def validate_content_type(field, value, record, version, commentary, pending): else: rest = '' if not re.search(token_re, ctype): - commentary.error('invalid type:', field, value) + commentary.error('Invalid type:', field, value) if ';' in rest: subtype, rest = rest.split(';', 1) else: subtype = rest if 
not re.search(token_re, subtype): - commentary.error('invalid subtype:', field, value) + commentary.error('Invalid subtype:', field, value) # at this point there can be multiple parameters, # some of which could have quoted string values with ; in them @@ -368,7 +371,7 @@ def validate_content_type(field, value, record, version, commentary, pending): def validate_digest(field, value, record, version, commentary, pending): if ':' not in value: - commentary.error('missing algorithm:', field, value) + commentary.error('Missing algorithm:', field, value) splits = value.split(':', 1) algorithm = splits[0] if len(splits) > 1: @@ -376,12 +379,12 @@ def validate_digest(field, value, record, version, commentary, pending): else: digest = 'none' if not re.search(token_re, algorithm): - commentary.error('invalid algorithm:', field, value) + commentary.error('Invalid algorithm:', field, value) else: try: Digester(algorithm) except ValueError: - commentary.comment('unknown digest algorithm:', field, value) + commentary.comment('Unknown digest algorithm:', field, value) if not re.search(token_re, digest): # https://github.com/iipc/warc-specifications/issues/48 # commentary.comment('spec incorrectly says this is an invalid digest', field, value) @@ -398,14 +401,14 @@ def validate_ip(field, value, record, version, commentary, pending): value = unicode(value) ipaddress.ip_address(value) except ValueError: - commentary.error('invalid ip:', field, value) + commentary.error('Invalid ip:', field, value) except (ImportError, NameError): # pragma: no cover - commentary.comment('did not check ip address format, install ipaddress module from pypi if you care') + commentary.comment('Did not check ip address format, install ipaddress module from pypi if you care') def validate_truncated(field, value, record, version, commentary, pending): if value.lower() not in {'length', 'time', 'disconnect', 'unspecified'}: - commentary.comment('unknown value, perhaps an extension:', field, value) + 
commentary.comment('Unknown value, perhaps an extension:', field, value) def validate_warcinfo_id(field, value, record, version, commentary, pending): @@ -440,7 +443,7 @@ def validate_profile(field, value, record, version, commentary, pending): if profiles_rev[value] != version: commentary.comment('WARC-Profile value is for a different version:', version, value) else: - commentary.comment('unknown value, perhaps an extension:', field, value) + commentary.comment('Unknown value, perhaps an extension:', field, value) if '/revisit/uri-agnostic-identical-payload-digest' in value: commentary.comment('This Heretrix extension never made it into the standard:', field, value) @@ -448,26 +451,26 @@ def validate_profile(field, value, record, version, commentary, pending): def validate_segment_number(field, value, record, version, commentary, pending): if not value.isdigit(): - commentary.error('must be an integer:', field, value) + commentary.error('Must be an integer:', field, value) return iv = int(value) if iv == 0: - commentary.error('must be 1 or greater:', field, value) + commentary.error('Must be 1 or greater:', field, value) rec_type = record.rec_headers.get_header('WARC-Type', 'none') if rec_type != 'continuation': if iv != 1: - commentary.error('non-continuation records must always have WARC-Segment-Number: 1:', field, value) + commentary.error('Non-continuation records must always have WARC-Segment-Number: 1:', field, value) origin_id = record.rec_headers.get_header('WARC-Segment-Origin-ID') if origin_id is None: - commentary.error('segmented records must have both WARC-Segment-Number and WARC-Segment-Origin-ID') + commentary.error('Segmented records must have both WARC-Segment-Number and WARC-Segment-Origin-ID') if rec_type in {'warcinfo', 'request', 'metadata', 'revisit'}: - commentary.recommendation('do not segment WARC-Type', rec_type) + commentary.recommendation('Do not segment WARC-Type', rec_type) def validate_segment_total_length(field, value, record, 
version, commentary, pending): if not value.isdigit(): - commentary.error('must be an integer:', field, value) + commentary.error('Must be an integer:', field, value) def validate_refers_to_filename(field, value, record, version, commentary, pending): @@ -525,6 +528,7 @@ def validate_refers_to_file_offset(field, value, record, version, commentary, pe 'validate': validate_profile, }, 'WARC-Identified-Payload-Type': { + # see also https://github.com/iipc/warc-specifications/issues/49 -- odd that it's allowed for request, revisit, continuation 'validate': validate_content_type, }, 'WARC-Segment-Origin-ID': { @@ -565,7 +569,7 @@ def validate_refers_to_file_offset(field, value, record, version, commentary, pe 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'Content-Type', 'WARC-Target-URI'], 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated', - 'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Segment-Number', 'WARC-Segment-Origin-ID'], + 'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Identified-Payload-Type', 'WARC-Segment-Number', 'WARC-Segment-Origin-ID'], 'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], 'validate': validate_response, }, @@ -605,7 +609,7 @@ def validate_refers_to_file_offset(field, value, record, version, commentary, pe }, 'conversion': { 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'WARC-Target-URI'], - 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Truncated', 'WARC-Refers-To', 'WARC-Warcinfo-ID', 'WARC-Segment-Number', 'WARC-Segment-Origin-ID'], + 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Truncated', 'WARC-Refers-To', 'WARC-Warcinfo-ID', 'WARC-Identified-Payload-Type', 'WARC-Segment-Number', 'WARC-Segment-Origin-ID'], 'prohibited': ['WARC-Concurrent-To', 'WARC-IP-Address', 'WARC-Refers-To-Target-URI', 
'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], 'validate': validate_conversion, }, @@ -613,7 +617,7 @@ def validate_refers_to_file_offset(field, value, record, version, commentary, pe 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'WARC-Segment-Number', 'WARC-Segment-Origin-ID', 'WARC-Target-URI'], 'optional': ['WARC-Segment-Total-Length', 'WARC-Truncated'], - 'prohibited': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], + 'prohibited': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile', 'WARC-Identified-Payload-Type'], 'validate': validate_continuation, }, } @@ -629,17 +633,17 @@ def make_header_set(config, kinds): def validate_fields_against_rec_type(rec_type, config, rec_headers, commentary, allow_all=False): for req in sorted(config.get('required', [])): if not rec_headers.get_header(req): - commentary.error('missing required header:', req) + commentary.error('Missing required header:', req) for rec in sorted(config.get('recommended', [])): if not rec_headers.get_header(rec): - commentary.recommendation('missing recommended header:', rec) + commentary.recommendation('Missing recommended header:', rec) allowed = make_header_set(config, ('required', 'optional', 'recommended', 'ignored')) prohibited = make_header_set(config, ('prohibited',)) for field, value in rec_headers.headers: # XXX not exported fl = field.lower() if fl in prohibited: - commentary.error('field not allowed in record type:', rec_type, field) + commentary.error('Field not allowed in record type:', rec_type, field) elif allow_all or fl in allowed: pass elif fl in warc_fields: # pragma: no cover (this is a tester.py configuration omission) @@ -666,15 +670,15 @@ def validate_record(record): 
for field, value in record.rec_headers.headers: # XXX not exported field_l = field.lower() if field_l != 'warc-concurrent-to' and field_l in seen_fields: - commentary.error('duplicate field seen:', field, value) + commentary.error('Duplicate field seen:', field, value) seen_fields.add(field_l) if field_l not in warc_fields: - commentary.comment('unknown field, no validation performed:', field, value) + commentary.comment('Unknown field, no validation performed:', field, value) continue config = warc_fields[field_l] if 'minver' in config: if version < config['minver']: - commentary.comment('field was introduced after this warc version:', version, field, value) + commentary.comment('Field was introduced after this warc version:', version, field, value) if 'validate' in config: config['validate'](field, value, record, version, commentary, pending) @@ -780,13 +784,13 @@ def _revisit_compare(record_id, fields, source_field, wanted_id, all_records, ta return if target_field.lower() not in all_records[wanted_id]: - commentary.comment('revisit target lacks field:', wanted_id, target_field) + commentary.comment('Revisit target lacks field:', wanted_id, target_field) return source_value = fields[source_field.lower()] target_value = all_records[wanted_id][target_field.lower()] if source_value != target_value: - commentary.comment('revisit and revisit target disagree:', + commentary.comment('Revisit and revisit target disagree:', record_id, source_field, source_value, wanted_id, target_field, target_value) @@ -827,7 +831,7 @@ def check_global_segment(all_records): pass -def _process_one(warcfile, all_records, concurrent_to): +def _process_one(warcfile, all_records, concurrent_to, verbose): if warcfile.endswith('.arc') or warcfile.endswith('.arc.gz'): return with open(warcfile, 'rb') as stream: @@ -842,7 +846,7 @@ def _process_one(warcfile, all_records, concurrent_to): record.stream_for_digest_check() - if commentary.has_comments() or record.digest_checker.passed is False: + if 
verbose or commentary.has_comments() or record.digest_checker.passed is False: print(' ', 'WARC-Record-ID', commentary.record_id()) print(' ', 'WARC-Type', commentary.rec_type()) @@ -865,6 +869,7 @@ def _process_one(warcfile, all_records, concurrent_to): class Tester(object): def __init__(self, cmd): self.inputs = cmd.inputs + self.verbose = cmd.verbose self.exit_value = 0 self.all_records = defaultdict(dict) self.concurrent_to = defaultdict(list) @@ -875,12 +880,12 @@ def process_all(self): try: self.process_one(warcfile) except ArchiveLoadFailed as e: - print(' saw exception ArchiveLoadFailed: '+str(e).rstrip(), file=sys.stderr) - print(' skipping rest of file', file=sys.stderr) + print(' saw exception ArchiveLoadFailed: '+str(e).rstrip()) + print(' skipping rest of file') check_global(self.all_records, self.concurrent_to) return self.exit_value def process_one(self, warcfile): - _process_one(warcfile, self.all_records, self.concurrent_to) + _process_one(warcfile, self.all_records, self.concurrent_to, self.verbose) From 5b716b7fbf7fc26b5d11171c97f3e2f98f9f100f Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Thu, 31 Jan 2019 21:49:41 -0800 Subject: [PATCH 25/68] ... 
--- test/test_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_tests.py b/test/test_tests.py index ebbdb509..9c3c9fec 100644 --- a/test/test_tests.py +++ b/test/test_tests.py @@ -342,7 +342,7 @@ def test_leftovers(): warcio.tester.validate_profile('blah', 'blah', None, '999', commentary, None) expected = '''\ -error: must be an integer: Content-Length not-an-integer +error: Must be an integer: Content-Length not-an-integer ''' assert '\n'.join(commentary.comments())+'\n' == expected From fb8e3faa9556df79b26024baedb5533d2089c314 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Fri, 1 Feb 2019 10:25:40 -0800 Subject: [PATCH 26/68] revisits and global detection with just one file --- warcio/tester.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/warcio/tester.py b/warcio/tester.py index 9605ea7b..68f108b2 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -722,7 +722,10 @@ def save_global_info(record, warcfile, commentary, all_records, concurrent_to): save['warc-concurrent-to'].append(value) if record_id in all_records: - commentary.error('Duplicate WARC-Record-ID:', record_id, 'found in files', warcfile, all_records[record_id]['warcfile']) + if warcfile != all_records[record_id]['warcfile']: + commentary.error('Duplicate WARC-Record-ID:', record_id, 'found in files', warcfile, all_records[record_id]['warcfile']) + else: + commentary.error('Duplicate WARC-Record-ID:', record_id) else: all_records[record_id] = save @@ -853,9 +856,12 @@ def _process_one(warcfile, all_records, concurrent_to, verbose): if record.digest_checker.passed is True: print(' digest pass') elif record.digest_checker.passed is None: - if digest_present: # pragma: no cover - # WARC record missing Content-Length: header, which is verboten - print(' digest present but not checked') + if digest_present: + if commentary.rec_type() == 'revisit': + print(' digest present but not checked (revisit)') + else: # pragma: no cover + # 
WARC record missing Content-Length: header, which is verboten + print(' digest present but not checked') else: print(' digest not present') for p in record.digest_checker.problems: From d2436323fb568fb8729e2d7d3404ebbe8ce29b60 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Fri, 1 Feb 2019 15:47:01 -0800 Subject: [PATCH 27/68] show errors for decompression and unchunking failures --- test/test_tests.py | 14 +++++++------- warcio/archiveiterator.py | 5 +++-- warcio/bufferedreaders.py | 17 ++++++++++++++--- warcio/recordloader.py | 10 ++++++---- warcio/tester.py | 12 ++++++++++-- 5 files changed, 40 insertions(+), 18 deletions(-) diff --git a/test/test_tests.py b/test/test_tests.py index 9c3c9fec..200df8ae 100644 --- a/test/test_tests.py +++ b/test/test_tests.py @@ -297,17 +297,17 @@ def test_digests(): WARC-Type request digest pass error: WARC-IP-Address should be used for http and https requests - error: Duplicate WARC-Record-ID: found in files test/data/example-digest-bad.warc test/data/example-digest-bad.warc + error: Duplicate WARC-Record-ID: WARC-Record-ID WARC-Type request digest pass error: WARC-IP-Address should be used for http and https requests - error: Duplicate WARC-Record-ID: found in files test/data/example-digest-bad.warc test/data/example-digest-bad.warc + error: Duplicate WARC-Record-ID: WARC-Record-ID WARC-Type request digest pass error: WARC-IP-Address should be used for http and https requests - error: Duplicate WARC-Record-ID: found in files test/data/example-digest-bad.warc test/data/example-digest-bad.warc + error: Duplicate WARC-Record-ID: test/data/example.warc WARC-Record-ID WARC-Type request @@ -316,11 +316,11 @@ def test_digests(): error: Duplicate WARC-Record-ID: found in files test/data/example.warc test/data/example-digest-bad.warc WARC-Record-ID WARC-Type revisit - digest present but not checked - recommendation: missing recommended header: WARC-Refers-To + digest present but not checked (revisit) + recommendation: Missing 
recommended header: WARC-Refers-To comment: This Heretrix extension never made it into the standard: WARC-Profile http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest - comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com/ - comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Date 2017-03-06T04:02:06Z + comment: Field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com/ + comment: Field was introduced after this warc version: 1.0 WARC-Refers-To-Date 2017-03-06T04:02:06Z WARC-Record-ID WARC-Type request digest not present diff --git a/warcio/archiveiterator.py b/warcio/archiveiterator.py index 0d1fe2dd..176acb1c 100644 --- a/warcio/archiveiterator.py +++ b/warcio/archiveiterator.py @@ -43,13 +43,14 @@ class ArchiveIterator(six.Iterator): def __init__(self, fileobj, no_record_parse=False, verify_http=False, arc2warc=False, ensure_http_headers=False, block_size=BUFF_SIZE, - check_digests=False, fixup_bugs=True): + check_digests=False, fixup_bugs=True, raise_exceptions=False): self.fh = fileobj self.loader = ArcWarcRecordLoader(verify_http=verify_http, arc2warc=arc2warc, - fixup_bugs=fixup_bugs) + fixup_bugs=fixup_bugs, + raise_exceptions=raise_exceptions) self.known_format = None self.mixed_arc_warc = arc2warc diff --git a/warcio/bufferedreaders.py b/warcio/bufferedreaders.py index 5b11522b..4ad52f6d 100644 --- a/warcio/bufferedreaders.py +++ b/warcio/bufferedreaders.py @@ -36,6 +36,13 @@ def brotli_decompressor(): pass +#================================================================= +class DecompressionException(Exception): + def __init__(self, msg, data=b''): + Exception.__init__(self, msg) + self.data = data + + #================================================================= class BufferedReader(object): """ @@ -64,7 +71,8 @@ class BufferedReader(object): def __init__(self, stream, block_size=BUFF_SIZE, decomp_type=None, 
starting_data=None, - read_all_members=False): + read_all_members=False, + raise_exceptions=False): self.stream = stream self.block_size = block_size @@ -77,6 +85,7 @@ def __init__(self, stream, block_size=BUFF_SIZE, self.buff_size = 0 self.read_all_members = read_all_members + self.raise_exceptions = raise_exceptions def set_decomp(self, decomp_type): self._init_decomp(decomp_type) @@ -142,6 +151,8 @@ def _decompress(self, data): self._init_decomp('deflate_alt') data = self._decompress(data) else: + if self.raise_exceptions: + raise DecompressionException(str(e)) self.decompressor = None # otherwise (partly decompressed), something is wrong else: @@ -280,13 +291,13 @@ class ChunkedDataReader(BufferedReader): If at any point the chunked header is not available, the stream is assumed to not be chunked and no more dechunking occurs. """ - def __init__(self, stream, raise_exceptions=False, **kwargs): + def __init__(self, stream, **kwargs): super(ChunkedDataReader, self).__init__(stream, **kwargs) self.all_chunks_read = False self.not_chunked = False # if False, we'll use best-guess fallback for parse errors - self.raise_chunked_data_exceptions = raise_exceptions + self.raise_chunked_data_exceptions = kwargs.get('raise_exceptions') def _fillbuff(self, block_size=None): if self.not_chunked: diff --git a/warcio/recordloader.py b/warcio/recordloader.py index 1f17d1f0..d00e8642 100644 --- a/warcio/recordloader.py +++ b/warcio/recordloader.py @@ -20,6 +20,7 @@ def __init__(self, *args, **kwargs): self.http_headers, self.content_type, self.length) = args self.payload_length = -1 self.digest_checker = kwargs.get('digest_checker') + self.raise_exceptions = kwargs.get('raise_exceptions') def content_stream(self): if not self.http_headers: @@ -34,9 +35,9 @@ def content_stream(self): encoding = None if self.http_headers.get_header('transfer-encoding') == 'chunked': - return ChunkedDataReader(self.raw_stream, decomp_type=encoding) + return ChunkedDataReader(self.raw_stream, 
decomp_type=encoding, raise_exceptions=self.raise_exceptions) elif encoding: - return BufferedReader(self.raw_stream, decomp_type=encoding) + return BufferedReader(self.raw_stream, decomp_type=encoding, raise_exceptions=self.raise_exceptions) else: return self.raw_stream @@ -55,7 +56,7 @@ class ArcWarcRecordLoader(object): NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:') HTTP_SCHEMES = ('http:', 'https:') - def __init__(self, verify_http=True, arc2warc=True, fixup_bugs=True): + def __init__(self, verify_http=True, arc2warc=True, fixup_bugs=True, raise_exceptions=False): if arc2warc: self.arc_parser = ARC2WARCHeadersParser() else: @@ -66,6 +67,7 @@ def __init__(self, verify_http=True, arc2warc=True, fixup_bugs=True): self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http) self.fixup_bugs = fixup_bugs + self.raise_exceptions = raise_exceptions def parse_record_stream(self, stream, statusline=None, @@ -147,7 +149,7 @@ def parse_record_stream(self, stream, return ArcWarcRecord(the_format, rec_type, rec_headers, stream, http_headers, - content_type, length, digest_checker=digest_checker) + content_type, length, digest_checker=digest_checker, raise_exceptions=self.raise_exceptions) def wrap_digest_verifying_stream(self, stream, rec_type, rec_headers, digest_checker, length=None): payload_digest = rec_headers.get_header('WARC-Payload-Digest') diff --git a/warcio/tester.py b/warcio/tester.py index 68f108b2..84167c4c 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -8,6 +8,7 @@ from warcio.archiveiterator import WARCIterator from warcio.utils import to_native_str, Digester from warcio.exceptions import ArchiveLoadFailed +from warcio.bufferedreaders import ChunkedDataException, DecompressionException class Commentary(object): @@ -838,7 +839,7 @@ def _process_one(warcfile, all_records, concurrent_to, verbose): if warcfile.endswith('.arc') or warcfile.endswith('.arc.gz'): return with open(warcfile, 'rb') as stream: - for record in WARCIterator(stream, 
check_digests=True, fixup_bugs=False): + for record in WARCIterator(stream, check_digests=True, fixup_bugs=False, raise_exceptions=True): record = WrapRecord(record) digest_present = (record.rec_headers.get_header('WARC-Payload-Digest') or @@ -847,7 +848,14 @@ def _process_one(warcfile, all_records, concurrent_to, verbose): commentary = validate_record(record) save_global_info(record, warcfile, commentary, all_records, concurrent_to) - record.stream_for_digest_check() + try: + record.stream_for_digest_check() + except ChunkedDataException: + commentary.error('Transfer-Encoding: chunked, saw an error attempting to unchunk') + pass + except DecompressionException as e: + commentary.error('Content-Encoding indicates compression, saw an error attempting to decompress: '+str(e)) + pass if verbose or commentary.has_comments() or record.digest_checker.passed is False: print(' ', 'WARC-Record-ID', commentary.record_id()) From 29517c41697e011a37e210a93f1e464f3a934bc6 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Fri, 1 Feb 2019 22:13:07 -0800 Subject: [PATCH 28/68] make this function reentrant --- warcio/recordloader.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/warcio/recordloader.py b/warcio/recordloader.py index d00e8642..f8a47db4 100644 --- a/warcio/recordloader.py +++ b/warcio/recordloader.py @@ -21,11 +21,15 @@ def __init__(self, *args, **kwargs): self.payload_length = -1 self.digest_checker = kwargs.get('digest_checker') self.raise_exceptions = kwargs.get('raise_exceptions') + self._content_stream = None def content_stream(self): if not self.http_headers: return self.raw_stream + if self._content_stream: + return self._content_stream + encoding = self.http_headers.get_header('content-encoding') if encoding: @@ -35,11 +39,13 @@ def content_stream(self): encoding = None if self.http_headers.get_header('transfer-encoding') == 'chunked': - return ChunkedDataReader(self.raw_stream, decomp_type=encoding, 
raise_exceptions=self.raise_exceptions) + self._content_stream = ChunkedDataReader(self.raw_stream, decomp_type=encoding, raise_exceptions=self.raise_exceptions) elif encoding: - return BufferedReader(self.raw_stream, decomp_type=encoding, raise_exceptions=self.raise_exceptions) + self._content_stream = BufferedReader(self.raw_stream, decomp_type=encoding, raise_exceptions=self.raise_exceptions) else: - return self.raw_stream + self._content_stream = self.raw_stream + + return self._content_stream #================================================================= From 844807e63d6a0a98b75080e63b8a5192b764aecc Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Fri, 1 Feb 2019 22:13:26 -0800 Subject: [PATCH 29/68] narrow exception; fix bug not reading to the end of a chunked buffer --- warcio/bufferedreaders.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/warcio/bufferedreaders.py b/warcio/bufferedreaders.py index 4ad52f6d..97325b7d 100644 --- a/warcio/bufferedreaders.py +++ b/warcio/bufferedreaders.py @@ -38,9 +38,8 @@ def brotli_decompressor(): #================================================================= class DecompressionException(Exception): - def __init__(self, msg, data=b''): + def __init__(self, msg): Exception.__init__(self, msg) - self.data = data #================================================================= @@ -144,7 +143,7 @@ def _decompress(self, data): if self.decompressor and data: try: data = self.decompressor.decompress(data) - except Exception as e: + except zlib.error as e: # if first read attempt, assume non-gzipped stream if self.num_block_read == 0: if self.decomp_type == 'deflate': @@ -342,6 +341,8 @@ def _try_decode(self, length_header): if not chunk_size: # chunk_size 0 indicates end of file + final_data = self.stream.read(2) + assert(final_data == b'\r\n') self.all_chunks_read = True self._process_read(b'') return From a55afd311512ca11b96be25f94043ae26fddaf47 Mon Sep 17 00:00:00 2001 From: Greg Lindahl 
Date: Sat, 2 Feb 2019 09:30:51 -0800 Subject: [PATCH 30/68] ... --- warcio/tester.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/warcio/tester.py b/warcio/tester.py index 84167c4c..84ea75c3 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -840,6 +840,7 @@ def _process_one(warcfile, all_records, concurrent_to, verbose): return with open(warcfile, 'rb') as stream: for record in WARCIterator(stream, check_digests=True, fixup_bugs=False, raise_exceptions=True): + #for record in WARCIterator(stream, check_digests=True, fixup_bugs=False): record = WrapRecord(record) digest_present = (record.rec_headers.get_header('WARC-Payload-Digest') or @@ -850,11 +851,11 @@ def _process_one(warcfile, all_records, concurrent_to, verbose): try: record.stream_for_digest_check() - except ChunkedDataException: - commentary.error('Transfer-Encoding: chunked, saw an error attempting to unchunk') + except ChunkedDataException as e: + commentary.comment('Transfer-Encoding: chunked, saw exception: '+str(e)) pass except DecompressionException as e: - commentary.error('Content-Encoding indicates compression, saw an error attempting to decompress: '+str(e)) + commentary.comment('Content-Encoding indicates compression, saw: '+str(e)) pass if verbose or commentary.has_comments() or record.digest_checker.passed is False: From a33a5eb104e969ee8af3abbe2df0a925e641608a Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Wed, 6 Feb 2019 11:53:02 -0800 Subject: [PATCH 31/68] put tester output in external files --- test/data/example-digest-bad.warc.test | 22 ++ test/data/example.warc.test | 16 + .../standard-torture-validate-field.warc.test | 80 ++++ ...standard-torture-validate-record.warc.test | 112 ++++++ test/test_tester.py | 96 +++++ test/test_tests.py | 348 ------------------ 6 files changed, 326 insertions(+), 348 deletions(-) create mode 100644 test/data/example-digest-bad.warc.test create mode 100644 test/data/example.warc.test create mode 100644 
test/data/standard-torture-validate-field.warc.test create mode 100644 test/data/standard-torture-validate-record.warc.test create mode 100644 test/test_tester.py delete mode 100644 test/test_tests.py diff --git a/test/data/example-digest-bad.warc.test b/test/data/example-digest-bad.warc.test new file mode 100644 index 00000000..15a5efaf --- /dev/null +++ b/test/data/example-digest-bad.warc.test @@ -0,0 +1,22 @@ +test/data/example-digest-bad.warc + WARC-Record-ID + WARC-Type request + payload digest failed: sha1:1112H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ + error: WARC-IP-Address should be used for http and https requests + WARC-Record-ID + WARC-Type request + digest pass + error: WARC-IP-Address should be used for http and https requests + error: Duplicate WARC-Record-ID: + WARC-Record-ID + WARC-Type request + digest pass + error: WARC-IP-Address should be used for http and https requests + error: Duplicate WARC-Record-ID: + WARC-Record-ID + WARC-Type request + digest pass + error: WARC-IP-Address should be used for http and https requests + error: Duplicate WARC-Record-ID: +global Concurrent-To checks + comment: WARC-Concurrent-To not found: WARC-Concurrent-To diff --git a/test/data/example.warc.test b/test/data/example.warc.test new file mode 100644 index 00000000..52b3c79f --- /dev/null +++ b/test/data/example.warc.test @@ -0,0 +1,16 @@ +test/data/example.warc + WARC-Record-ID + WARC-Type request + digest not present + error: WARC-IP-Address should be used for http and https requests + WARC-Record-ID + WARC-Type revisit + digest present but not checked (revisit) + recommendation: Missing recommended header: WARC-Refers-To + comment: This Heretrix extension never made it into the standard: WARC-Profile http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest + comment: Field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com/ + comment: Field was introduced after this warc version: 1.0 WARC-Refers-To-Date 
2017-03-06T04:02:06Z + WARC-Record-ID + WARC-Type request + digest not present + error: WARC-IP-Address should be used for http and https requests diff --git a/test/data/standard-torture-validate-field.warc.test b/test/data/standard-torture-validate-field.warc.test new file mode 100644 index 00000000..de2e3fe1 --- /dev/null +++ b/test/data/standard-torture-validate-field.warc.test @@ -0,0 +1,80 @@ +test/data/standard-torture-validate-field.warc + WARC-Record-ID + WARC-Type does-not-exist + unknown hash algorithm name in block digest + error: uri must not be within <>: WARC-Target-URI + error: Duplicate field seen: WARC-Target-URI example.com + error: Invalid uri, no scheme: WARC-Target-URI example.com + error: Duplicate field seen: WARC-Target-URI ex ample.com + error: Invalid uri, no scheme: WARC-Target-URI ex ample.com + error: Invalid uri, contains whitespace: WARC-Target-URI ex ample.com + error: Duplicate field seen: WARC-Target-URI h<>ttp://example.com/ + error: Invalid uri scheme, bad character: WARC-Target-URI h<>ttp://example.com/ + error: Duplicate field seen: WARC-Type CAPITALIZED + error: uri must be within <>: WARC-Concurrent-To http://example.com/ + error: Duplicate field seen: WARC-Date 2017-03-06T04:03:53.Z + error: Invalid timestamp: WARC-Date 2017-03-06T04:03:53.Z + error: WARC versions <= 1.0 may not have timestamps with fractional seconds: WARC-Date 2017-03-06T04:03:53.Z + error: Must contain a /: Content-Type asdf + error: Invalid subtype: Content-Type asdf + error: Duplicate field seen: Content-Type has space/asdf + error: Invalid type: Content-Type has space/asdf + error: Duplicate field seen: Content-Type asdf/has space + error: Invalid subtype: Content-Type asdf/has space + error: Duplicate field seen: Content-Type asdf/has space;asdf + error: Invalid subtype: Content-Type asdf/has space;asdf + error: Missing algorithm: WARC-Block-Digest asdf + error: Duplicate field seen: WARC-Block-Digest has space:asdf + error: Invalid algorithm: 
WARC-Block-Digest has space:asdf + error: Duplicate field seen: WARC-Block-Digest sha1:&$*^&*^#*&^ + error: Invalid ip: WARC-IP-Address 1.2.3.4.5 + error: uri must be within <>: WARC-Warcinfo-ID asdf:asdf + error: Duplicate field seen: WARC-Profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest + error: Must contain a /: WARC-Identified-Payload-Type asdf + error: Invalid subtype: WARC-Identified-Payload-Type asdf + error: uri must be within <>: WARC-Segment-Origin-ID http://example.com + error: Must be an integer: WARC-Segment-Number not-an-integer + error: Duplicate field seen: WARC-Segment-Number 0 + error: Must be 1 or greater: WARC-Segment-Number 0 + error: Non-continuation records must always have WARC-Segment-Number: 1: WARC-Segment-Number 0 + error: Duplicate field seen: WARC-Segment-Number 1 + error: Duplicate field seen: WARC-Segment-Number 2 + error: Non-continuation records must always have WARC-Segment-Number: 1: WARC-Segment-Number 2 + error: Duplicate field seen: WARC-Segment-Total-Length not-an-integer + error: Must be an integer: WARC-Segment-Total-Length not-an-integer + error: Invalid timestamp: WARC-Refers-To-Date not-a-date + comment: Unknown WARC-Type: WARC-Type does-not-exist + comment: WARC-Type is not lower-case: WARC-Type CAPITALIZED + comment: Unknown WARC-Type: WARC-Type CAPITALIZED + comment: Unknown digest algorithm: WARC-Block-Digest asdf + comment: Invalid-looking digest value: WARC-Block-Digest sha1:&$*^&*^#*&^ + comment: Unknown value, perhaps an extension: WARC-Truncated invalid + comment: Unknown value, perhaps an extension: WARC-Profile asdf + comment: Field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com + comment: Field was introduced after this warc version: 1.0 WARC-Refers-To-Date not-a-date + comment: This Heretrix extension never made it into the standard: WARC-Refers-To-Filename asdf + comment: This Heretrix extension never made it into the standard: 
WARC-Refers-To-File-Offset 1234 + comment: Unknown field, no validation performed: WARC-Unknown-Field asdf + WARC-Record-ID None + WARC-Type invalid + digest not present + error: Duplicate field seen: WARC-Date 2017-03-06T04:03:53.Z + error: Invalid timestamp: WARC-Date 2017-03-06T04:03:53.Z + error: Duplicate field seen: WARC-Date 2017-03-06T04:03:53.0Z + comment: Unknown WARC-Type: WARC-Type invalid + WARC-Record-ID None + WARC-Type request + digest not present + error: Segmented records must have both WARC-Segment-Number and WARC-Segment-Origin-ID + error: Missing required header: Content-Type + error: Missing required header: WARC-Date + error: Missing required header: WARC-Record-ID + error: Missing required header: WARC-Target-URI + recommendation: Do not segment WARC-Type request + saw exception ArchiveLoadFailed: Invalid WARC record, first line: WARC/invalid + skipping rest of file +global warcinfo checks + comment: WARC-Warcinfo-ID not found: WARC-Warcinfo-ID asdf:asdf +global Concurrent-To checks + comment: WARC-Concurrent-To not found: WARC-Concurrent-To + comment: WARC-Concurrent-To not found: WARC-Concurrent-To http://example.com/ diff --git a/test/data/standard-torture-validate-record.warc.test b/test/data/standard-torture-validate-record.warc.test new file mode 100644 index 00000000..e7b17345 --- /dev/null +++ b/test/data/standard-torture-validate-record.warc.test @@ -0,0 +1,112 @@ +test/data/standard-torture-validate-record.warc + WARC-Record-ID None + WARC-Type warcinfo + digest not present + error: uri must be within <>: WARC-Refers-To probhibited + error: Missing required header: WARC-Date + error: Missing required header: WARC-Record-ID + error: Field not allowed in record type: warcinfo WARC-Refers-To + error: warc-fields contains invalid utf-8: 'utf-8' codec can't decode byte 0xc3 in position 57: invalid continuation byte + comment: The first line of warc-fields cannot start with whitespace + comment: warc-fields lines must end with \r\n: 
test: lines should end with \r\n + comment: Missing colon in warc-fields line: no colon + comment: Invalid warc-fields name: token cannot have a space + WARC-Record-ID + WARC-Type warcinfo + digest not present + error: Missing required header: WARC-Date + comment: warc-fields block present but empty + WARC-Record-ID + WARC-Type warcinfo + digest not present + error: Missing required header: WARC-Date + recommendation: warcinfo Content-Type recommended to be application/warc-fields: not-application/warc-fields + WARC-Record-ID + WARC-Type response + digest not present + error: Missing required header: WARC-Date + error: Responses for http/https should have Content-Type of application/http; msgtype=response or application/http: text/plain + error: WARC-IP-Address should be used for http and https responses + error: http/https responses should have http headers + WARC-Record-ID + WARC-Type resource + digest not present + error: Missing required header: WARC-Date + error: resource records for dns shall have Content-Type of text/dns: text/plain + WARC-Record-ID + WARC-Type resource + digest not present + error: Missing required header: WARC-Date + comment: Unknown field, no validation performed: WARC-Test-TODO add another with valid block + WARC-Record-ID + WARC-Type resource + digest not present + error: Missing required header: Content-Type + error: Missing required header: WARC-Date + WARC-Record-ID + WARC-Type request + digest not present + error: Missing required header: WARC-Date + error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http: text/plain + error: WARC-IP-Address should be used for http and https requests + WARC-Record-ID + WARC-Type request + digest not present + error: Missing required header: WARC-Date + error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http: text/plain + WARC-Record-ID + WARC-Type metadata + digest not present + error: 
Missing required header: WARC-Date + comment: warc-fields block present but empty + WARC-Record-ID + WARC-Type metadata + digest not present + error: Missing required header: WARC-Date + WARC-Record-ID + WARC-Type revisit + digest not present + error: Missing required header: Content-Type + error: Missing required header: WARC-Date + error: Missing required header: WARC-Target-URI + comment: Unknown value, perhaps an extension: WARC-Profile none + comment: No revisit details validation done due to unknown profile: none + WARC-Record-ID + WARC-Type revisit + digest not present + error: Missing required header: Content-Type + error: Missing required header: WARC-Date + error: Missing required header: WARC-Target-URI + error: Missing required header: WARC-Payload-Digest + recommendation: Missing recommended header: WARC-Refers-To + recommendation: Missing recommended header: WARC-Refers-To-Date + recommendation: Missing recommended header: WARC-Refers-To-Target-URI + comment: WARC-Profile value is for a different version: 1.0 http://netpreserve.org/warc/1.1/revisit/identical-payload-digest + WARC-Record-ID + WARC-Type revisit + digest not present + error: Missing required header: Content-Type + error: Missing required header: WARC-Date + error: Missing required header: WARC-Target-URI + recommendation: Missing recommended header: WARC-Refers-To + recommendation: Missing recommended header: WARC-Refers-To-Date + WARC-Record-ID + WARC-Type conversion + digest not present + error: Missing required header: WARC-Date + error: Missing required header: WARC-Target-URI + WARC-Record-ID + WARC-Type continuation + digest not present + error: Missing required header: WARC-Date + error: Missing required header: WARC-Segment-Origin-ID + error: Missing required header: WARC-Target-URI + error: continuation record must have WARC-Segment-Number > 1: 1 + comment: warcio test continuation code has not been tested, expect bugs + WARC-Record-ID + WARC-Type continuation + digest not 
present + error: Missing required header: WARC-Date + error: Missing required header: WARC-Segment-Origin-ID + error: Missing required header: WARC-Target-URI + comment: warcio test continuation code has not been tested, expect bugs diff --git a/test/test_tester.py b/test/test_tester.py new file mode 100644 index 00000000..49b1cc6d --- /dev/null +++ b/test/test_tester.py @@ -0,0 +1,96 @@ +from warcio.cli import main +from warcio.utils import to_native_str +import warcio.tester + +from . import get_test_file +from .test_cli import patch_stdout + + +file_map = {} + + +def map_test_file(filename): + file_map[filename] = get_test_file(filename) + return file_map[filename] + + +def helper(args, expected_exit_value): + with patch_stdout() as buff: + exit_value = None + try: + main(args=args) + except SystemExit as e: + exit_value = e.code + finally: + assert exit_value == expected_exit_value + + return to_native_str(buff.getvalue()) + + +def remove_before_test_data(s): + ret = '' + for line in s.splitlines(True): + for filename, value in file_map.items(): + if value in line: + line = line.replace(value, 'test/data/' + filename) + ret += line + return ret + + +def run_one(f): + args = ['test'] + args.append(f) + + with open(f+'.test', 'r') as expectedf: + expected = expectedf.read() + + value = helper(args, 0) + print(remove_before_test_data(value)) + + actual = remove_before_test_data(value) + + assert actual == expected + + +def test_torture(): + files = ['standard-torture-validate-record.warc', + 'standard-torture-validate-field.warc'] + [run_one(map_test_file(filename)) for filename in files] + + +def test_arc(): + files = ['does-not-exist.arc'] + files = [map_test_file(filename) for filename in files] + + args = ['test'] + args.extend(files) + + expected = """\ +test/data/does-not-exist.arc +""" + + value = helper(args, 0) + assert remove_before_test_data(value) == expected + + +def test_digests(): + # needed for test coverage + files = ['example-digest-bad.warc', 
'example.warc'] + [run_one(map_test_file(filename)) for filename in files] + + +def test_leftovers(): + commentary = warcio.tester.Commentary('id', 'type') + assert not commentary.has_comments() + + # hard to test because invalid WARC Content-Length raises in archiveiterator + warcio.tester.validate_content_length('Content-Length', 'not-an-integer', None, '1.0', commentary, None) + + # hard to test because warcio raises for unknown WARC version + warcio.tester.validate_profile('blah', 'blah', None, '999', commentary, None) + + expected = '''\ +error: Must be an integer: Content-Length not-an-integer +''' + + assert '\n'.join(commentary.comments())+'\n' == expected diff --git a/test/test_tests.py b/test/test_tests.py deleted file mode 100644 index 200df8ae..00000000 --- a/test/test_tests.py +++ /dev/null @@ -1,348 +0,0 @@ -from warcio.cli import main -from warcio.utils import to_native_str -import warcio.tester - -from . import get_test_file -from .test_cli import patch_stdout - - -file_map = {} - - -def map_test_file(filename): - file_map[filename] = get_test_file(filename) - return file_map[filename] - - -def helper(args, expected_exit_value): - with patch_stdout() as buff: - exit_value = None - try: - main(args=args) - except SystemExit as e: - exit_value = e.code - finally: - assert exit_value == expected_exit_value - - return to_native_str(buff.getvalue()) - - -def remove_before_test_data(s): - ret = '' - for line in s.splitlines(True): - for filename, value in file_map.items(): - if value in line: - line = line.replace(value, 'test/data/' + filename) - ret += line - return ret - - -def test_torture_validate_record(): - files = ['standard-torture-validate-record.warc'] - files = [map_test_file(filename) for filename in files] - - args = ['test'] - args.extend(files) - - expected = """\ -test/data/standard-torture-validate-record.warc - WARC-Record-ID None - WARC-Type warcinfo - digest not present - error: uri must be within <>: WARC-Refers-To probhibited - 
error: missing required header: WARC-Date - error: missing required header: WARC-Record-ID - error: field not allowed in record type: warcinfo WARC-Refers-To - error: warc-fields contains invalid utf-8: 'utf-8' codec can't decode byte 0xc3 in position 57: invalid continuation byte - comment: The first line of warc-fields cannot start with whitespace - comment: warc-fields lines must end with \\r\\n: test: lines should end with \\r\\n - comment: Missing colon in warc-fields line: no colon - comment: Invalid warc-fields name: token cannot have a space - WARC-Record-ID - WARC-Type warcinfo - digest not present - error: missing required header: WARC-Date - comment: warc-fields block present but empty - WARC-Record-ID - WARC-Type warcinfo - digest not present - error: missing required header: WARC-Date - recommendation: warcinfo Content-Type recommended to be application/warc-fields: not-application/warc-fields - WARC-Record-ID - WARC-Type response - digest not present - error: missing required header: WARC-Date - error: responses for http/https should have Content-Type of application/http; msgtype=response or application/http: text/plain - error: WARC-IP-Address should be used for http and https responses - error: http/https responses should have http headers - WARC-Record-ID - WARC-Type resource - digest not present - error: missing required header: WARC-Date - error: resource records for dns shall have Content-Type of text/dns: text/plain - WARC-Record-ID - WARC-Type resource - digest not present - error: missing required header: WARC-Date - comment: unknown field, no validation performed: WARC-Test-TODO add another with valid block - WARC-Record-ID - WARC-Type resource - digest not present - error: missing required header: Content-Type - error: missing required header: WARC-Date - WARC-Record-ID - WARC-Type request - digest not present - error: missing required header: WARC-Date - error: requests for http/https should have Content-Type of application/http; 
msgtype=request or application/http: text/plain - error: WARC-IP-Address should be used for http and https requests - WARC-Record-ID - WARC-Type request - digest not present - error: missing required header: WARC-Date - error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http: text/plain - WARC-Record-ID - WARC-Type metadata - digest not present - error: missing required header: WARC-Date - comment: warc-fields block present but empty - WARC-Record-ID - WARC-Type metadata - digest not present - error: missing required header: WARC-Date - WARC-Record-ID - WARC-Type revisit - digest not present - error: missing required header: Content-Type - error: missing required header: WARC-Date - error: missing required header: WARC-Target-URI - comment: unknown value, perhaps an extension: WARC-Profile none - comment: no revisit details validation done due to unknown profile: none - WARC-Record-ID - WARC-Type revisit - digest not present - error: missing required header: Content-Type - error: missing required header: WARC-Date - error: missing required header: WARC-Target-URI - error: missing required header: WARC-Payload-Digest - recommendation: missing recommended header: WARC-Refers-To - recommendation: missing recommended header: WARC-Refers-To-Date - recommendation: missing recommended header: WARC-Refers-To-Target-URI - comment: WARC-Profile value is for a different version: 1.0 http://netpreserve.org/warc/1.1/revisit/identical-payload-digest - WARC-Record-ID - WARC-Type revisit - digest not present - error: missing required header: Content-Type - error: missing required header: WARC-Date - error: missing required header: WARC-Target-URI - recommendation: missing recommended header: WARC-Refers-To - recommendation: missing recommended header: WARC-Refers-To-Date - WARC-Record-ID - WARC-Type conversion - digest not present - error: missing required header: WARC-Date - error: missing required header: WARC-Target-URI - 
WARC-Record-ID - WARC-Type continuation - digest not present - error: missing required header: WARC-Date - error: missing required header: WARC-Segment-Origin-ID - error: missing required header: WARC-Target-URI - error: continuation record must have WARC-Segment-Number > 1: 1 - comment: warcio test continuation code has not been tested, expect bugs - WARC-Record-ID - WARC-Type continuation - digest not present - error: missing required header: WARC-Date - error: missing required header: WARC-Segment-Origin-ID - error: missing required header: WARC-Target-URI - comment: warcio test continuation code has not been tested, expect bugs -""" - - value = helper(args, 0) - print(remove_before_test_data(value)) - - actual = remove_before_test_data(value) - - assert actual == expected - - -def test_torture_validate_field(): - files = ['standard-torture-validate-field.warc'] - files = [map_test_file(filename) for filename in files] - - args = ['test'] - args.extend(files) - - expected = """\ -test/data/standard-torture-validate-field.warc - WARC-Record-ID - WARC-Type does-not-exist - unknown hash algorithm name in block digest - error: uri must not be within <>: WARC-Target-URI - error: invalid uri scheme, bad character: WARC-Target-URI - error: duplicate field seen: WARC-Target-URI example.com - error: invalid uri, no scheme: WARC-Target-URI example.com - error: duplicate field seen: WARC-Target-URI ex ample.com - error: invalid uri, no scheme: WARC-Target-URI ex ample.com - error: invalid uri, contains whitespace: WARC-Target-URI ex ample.com - error: invalid uri scheme, bad character: WARC-Target-URI ex ample.com - error: duplicate field seen: WARC-Target-URI h<>ttp://example.com/ - error: invalid uri scheme, bad character: WARC-Target-URI h<>ttp://example.com/ - error: duplicate field seen: WARC-Type CAPITALIZED - error: uri must be within <>: WARC-Concurrent-To http://example.com/ - error: duplicate field seen: WARC-Date 2017-03-06T04:03:53.Z - error: Invalid timestamp: 
WARC-Date 2017-03-06T04:03:53.Z - error: WARC versions <= 1.0 may not have timestamps with fractional seconds: WARC-Date 2017-03-06T04:03:53.Z - error: must contain a /: Content-Type asdf - error: invalid subtype: Content-Type asdf - error: duplicate field seen: Content-Type has space/asdf - error: invalid type: Content-Type has space/asdf - error: duplicate field seen: Content-Type asdf/has space - error: invalid subtype: Content-Type asdf/has space - error: duplicate field seen: Content-Type asdf/has space;asdf - error: invalid subtype: Content-Type asdf/has space;asdf - error: missing algorithm: WARC-Block-Digest asdf - error: duplicate field seen: WARC-Block-Digest has space:asdf - error: invalid algorithm: WARC-Block-Digest has space:asdf - error: duplicate field seen: WARC-Block-Digest sha1:&$*^&*^#*&^ - error: invalid ip: WARC-IP-Address 1.2.3.4.5 - error: uri must be within <>: WARC-Warcinfo-ID asdf:asdf - error: duplicate field seen: WARC-Profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest - error: must contain a /: WARC-Identified-Payload-Type asdf - error: invalid subtype: WARC-Identified-Payload-Type asdf - error: uri must be within <>: WARC-Segment-Origin-ID http://example.com - error: must be an integer: WARC-Segment-Number not-an-integer - error: duplicate field seen: WARC-Segment-Number 0 - error: must be 1 or greater: WARC-Segment-Number 0 - error: non-continuation records must always have WARC-Segment-Number: 1: WARC-Segment-Number 0 - error: duplicate field seen: WARC-Segment-Number 1 - error: duplicate field seen: WARC-Segment-Number 2 - error: non-continuation records must always have WARC-Segment-Number: 1: WARC-Segment-Number 2 - error: duplicate field seen: WARC-Segment-Total-Length not-an-integer - error: must be an integer: WARC-Segment-Total-Length not-an-integer - error: Invalid timestamp: WARC-Refers-To-Date not-a-date - comment: unknown WARC-Type: WARC-Type does-not-exist - comment: WARC-Type is not lower-case: 
WARC-Type CAPITALIZED - comment: unknown WARC-Type: WARC-Type CAPITALIZED - comment: unknown digest algorithm: WARC-Block-Digest asdf - comment: Invalid-looking digest value: WARC-Block-Digest sha1:&$*^&*^#*&^ - comment: unknown value, perhaps an extension: WARC-Truncated invalid - comment: unknown value, perhaps an extension: WARC-Profile asdf - comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com - comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Date not-a-date - comment: This Heretrix extension never made it into the standard: WARC-Refers-To-Filename asdf - comment: This Heretrix extension never made it into the standard: WARC-Refers-To-File-Offset 1234 - comment: unknown field, no validation performed: WARC-Unknown-Field asdf - WARC-Record-ID None - WARC-Type invalid - digest not present - error: duplicate field seen: WARC-Date 2017-03-06T04:03:53.Z - error: Invalid timestamp: WARC-Date 2017-03-06T04:03:53.Z - error: duplicate field seen: WARC-Date 2017-03-06T04:03:53.0Z - comment: unknown WARC-Type: WARC-Type invalid - WARC-Record-ID None - WARC-Type request - digest not present - error: segmented records must have both WARC-Segment-Number and WARC-Segment-Origin-ID - error: missing required header: Content-Type - error: missing required header: WARC-Date - error: missing required header: WARC-Record-ID - error: missing required header: WARC-Target-URI - recommendation: do not segment WARC-Type request -global warcinfo checks - comment: WARC-Warcinfo-ID not found: WARC-Warcinfo-ID asdf:asdf -global Concurrent-To checks - comment: WARC-Concurrent-To not found: WARC-Concurrent-To - comment: WARC-Concurrent-To not found: WARC-Concurrent-To http://example.com/ -""" - - value = helper(args, 0) - actual = remove_before_test_data(value) - - print(actual) - assert actual == expected - - -def test_arc(): - files = ['does-not-exist.arc'] - files = [map_test_file(filename) for filename in files] 
- - args = ['test'] - args.extend(files) - - expected = """\ -test/data/does-not-exist.arc -""" - - value = helper(args, 0) - assert remove_before_test_data(value) == expected - - -def test_digests(): - # needed for test coverage - files = ['example-digest-bad.warc', 'example.warc'] - files = [map_test_file(filename) for filename in files] - - args = ['test'] - args.extend(files) - - expected = """\ -test/data/example-digest-bad.warc - WARC-Record-ID - WARC-Type request - payload digest failed: sha1:1112H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - error: WARC-IP-Address should be used for http and https requests - WARC-Record-ID - WARC-Type request - digest pass - error: WARC-IP-Address should be used for http and https requests - error: Duplicate WARC-Record-ID: - WARC-Record-ID - WARC-Type request - digest pass - error: WARC-IP-Address should be used for http and https requests - error: Duplicate WARC-Record-ID: - WARC-Record-ID - WARC-Type request - digest pass - error: WARC-IP-Address should be used for http and https requests - error: Duplicate WARC-Record-ID: -test/data/example.warc - WARC-Record-ID - WARC-Type request - digest not present - error: WARC-IP-Address should be used for http and https requests - error: Duplicate WARC-Record-ID: found in files test/data/example.warc test/data/example-digest-bad.warc - WARC-Record-ID - WARC-Type revisit - digest present but not checked (revisit) - recommendation: Missing recommended header: WARC-Refers-To - comment: This Heretrix extension never made it into the standard: WARC-Profile http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest - comment: Field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com/ - comment: Field was introduced after this warc version: 1.0 WARC-Refers-To-Date 2017-03-06T04:02:06Z - WARC-Record-ID - WARC-Type request - digest not present - error: WARC-IP-Address should be used for http and https requests -""" - - value = helper(args, 0) - 
assert remove_before_test_data(value) == expected - - -def test_leftovers(): - commentary = warcio.tester.Commentary('id', 'type') - assert not commentary.has_comments() - - # hard to test because invalid WARC Content-Length raises in archiveiterator - warcio.tester.validate_content_length('Content-Length', 'not-an-integer', None, '1.0', commentary, None) - - # hard to test because warcio raises for unknown WARC version - warcio.tester.validate_profile('blah', 'blah', None, '999', commentary, None) - - expected = '''\ -error: Must be an integer: Content-Length not-an-integer -''' - - assert '\n'.join(commentary.comments())+'\n' == expected From fec139ac253022895d4b864b3c73832a7c8c9a90 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Thu, 4 Apr 2019 15:01:31 -0700 Subject: [PATCH 32/68] wip --- test/test_tester.py | 2 +- warcio/archiveiterator.py | 5 ++- warcio/bufferedreaders.py | 49 +++++++++++++------------ warcio/recordloader.py | 43 ++++++++++++++++++---- warcio/tester.py | 76 +++++++++------------------------------ 5 files changed, 82 insertions(+), 93 deletions(-) diff --git a/test/test_tester.py b/test/test_tester.py index 49b1cc6d..08963ea9 100644 --- a/test/test_tester.py +++ b/test/test_tester.py @@ -80,7 +80,7 @@ def test_digests(): def test_leftovers(): - commentary = warcio.tester.Commentary('id', 'type') + commentary = warcio.recordloader.Commentary() assert not commentary.has_comments() # hard to test because invalid WARC Content-Length raises in archiveiterator diff --git a/warcio/archiveiterator.py b/warcio/archiveiterator.py index 176acb1c..0d1fe2dd 100644 --- a/warcio/archiveiterator.py +++ b/warcio/archiveiterator.py @@ -43,14 +43,13 @@ class ArchiveIterator(six.Iterator): def __init__(self, fileobj, no_record_parse=False, verify_http=False, arc2warc=False, ensure_http_headers=False, block_size=BUFF_SIZE, - check_digests=False, fixup_bugs=True, raise_exceptions=False): + check_digests=False, fixup_bugs=True): self.fh = fileobj self.loader = 
ArcWarcRecordLoader(verify_http=verify_http, arc2warc=arc2warc, - fixup_bugs=fixup_bugs, - raise_exceptions=raise_exceptions) + fixup_bugs=fixup_bugs) self.known_format = None self.mixed_arc_warc = arc2warc diff --git a/warcio/bufferedreaders.py b/warcio/bufferedreaders.py index 97325b7d..e07dcd4b 100644 --- a/warcio/bufferedreaders.py +++ b/warcio/bufferedreaders.py @@ -36,12 +36,6 @@ def brotli_decompressor(): pass -#================================================================= -class DecompressionException(Exception): - def __init__(self, msg): - Exception.__init__(self, msg) - - #================================================================= class BufferedReader(object): """ @@ -71,7 +65,7 @@ def __init__(self, stream, block_size=BUFF_SIZE, decomp_type=None, starting_data=None, read_all_members=False, - raise_exceptions=False): + commentary=None): self.stream = stream self.block_size = block_size @@ -84,7 +78,7 @@ def __init__(self, stream, block_size=BUFF_SIZE, self.buff_size = 0 self.read_all_members = read_all_members - self.raise_exceptions = raise_exceptions + self.commentary = commentary def set_decomp(self, decomp_type): self._init_decomp(decomp_type) @@ -96,6 +90,10 @@ def _init_decomp(self, decomp_type): self.decomp_type = decomp_type self.decompressor = self.DECOMPRESSORS[decomp_type.lower()]() except KeyError: + # XXX don't raise? 
+ # we don't know if the enduser cares or not + # or the record might actually be uncompressed + # XXX what does pywb do raise Exception('Decompression type not supported: ' + decomp_type) else: @@ -150,8 +148,8 @@ def _decompress(self, data): self._init_decomp('deflate_alt') data = self._decompress(data) else: - if self.raise_exceptions: - raise DecompressionException(str(e)) + if self.commentary: + self.commentary.comment('Payload claimed to be compressed but apparently is not') self.decompressor = None # otherwise (partly decompressed), something is wrong else: @@ -290,40 +288,43 @@ class ChunkedDataReader(BufferedReader): If at any point the chunked header is not available, the stream is assumed to not be chunked and no more dechunking occurs. """ - def __init__(self, stream, **kwargs): + def __init__(self, stream, raise_exceptions=False, commentary=None, **kwargs): super(ChunkedDataReader, self).__init__(stream, **kwargs) self.all_chunks_read = False - self.not_chunked = False - - # if False, we'll use best-guess fallback for parse errors - self.raise_chunked_data_exceptions = kwargs.get('raise_exceptions') + self.not_actually_chunked = False + self.at_start = True + self.raise_chunked_data_exceptions = raise_exceptions + self.commentary = commentary def _fillbuff(self, block_size=None): - if self.not_chunked: + if self.not_actually_chunked: return super(ChunkedDataReader, self)._fillbuff(block_size) # Loop over chunks until there is some data (not empty()) # In particular, gzipped data may require multiple chunks to # return any decompressed result - while (self.empty() and - not self.all_chunks_read and - not self.not_chunked): - + while (self.empty() and not self.all_chunks_read): try: length_header = self.stream.readline(64) self._try_decode(length_header) + self.at_start = False except ChunkedDataException as e: if self.raise_chunked_data_exceptions: raise - # Can't parse the data as chunked. 
# It's possible that non-chunked data is served # with a Transfer-Encoding: chunked. # Treat this as non-chunk encoded from here on. + if self.commentary: + if self.at_start: + self.commentary.comment('Buffer claimed to be chunked, but was not from the start') + else: + self.commentary.comment('Buffer is chunked but there was an unchunking error midway') self._process_read(length_header + e.data) - self.not_chunked = True + self.not_actually_chunked = True + self.at_start = False - # parse as block as non-chunked + # parse as non-chunked return super(ChunkedDataReader, self)._fillbuff(block_size) def _try_decode(self, length_header): @@ -362,6 +363,8 @@ def _try_decode(self, length_header): msg = 'Ran out of data before end of chunk' raise ChunkedDataException(msg, data) else: + if self.commentary: + self.commentary.comment('Chunked reader ran out of data before end of chunk') chunk_size = data_len self.all_chunks_read = True diff --git a/warcio/recordloader.py b/warcio/recordloader.py index f8a47db4..6629c9e9 100644 --- a/warcio/recordloader.py +++ b/warcio/recordloader.py @@ -13,6 +13,36 @@ from six.moves import zip +#================================================================= +class Commentary(object): + def __init__(self): + self.errors = [] + self.recommendations = [] + self._comments = [] + + def error(self, *args): + self.errors.append(args) + + def recommendation(self, *args): + self.recommendations.append(args) + + def comment(self, *args): + self._comments.append(args) + + def has_comments(self): + if self.errors or self.recommendations or self._comments: + return True + + def comments(self): + # XXX str() all of these, in case an int or other thing slips in? 
+ for e in self.errors: + yield 'error: ' + ' '.join(e) + for r in self.recommendations: + yield 'recommendation: ' + ' '.join(r) + for c in self._comments: + yield 'comment: ' + ' '.join(c) + + #================================================================= class ArcWarcRecord(object): def __init__(self, *args, **kwargs): @@ -20,7 +50,7 @@ def __init__(self, *args, **kwargs): self.http_headers, self.content_type, self.length) = args self.payload_length = -1 self.digest_checker = kwargs.get('digest_checker') - self.raise_exceptions = kwargs.get('raise_exceptions') + self.commentary = kwargs.get('commentary') self._content_stream = None def content_stream(self): @@ -39,9 +69,9 @@ def content_stream(self): encoding = None if self.http_headers.get_header('transfer-encoding') == 'chunked': - self._content_stream = ChunkedDataReader(self.raw_stream, decomp_type=encoding, raise_exceptions=self.raise_exceptions) + self._content_stream = ChunkedDataReader(self.raw_stream, decomp_type=encoding, commentary=self.commentary) elif encoding: - self._content_stream = BufferedReader(self.raw_stream, decomp_type=encoding, raise_exceptions=self.raise_exceptions) + self._content_stream = BufferedReader(self.raw_stream, decomp_type=encoding, commentary=self.commentary) else: self._content_stream = self.raw_stream @@ -62,7 +92,7 @@ class ArcWarcRecordLoader(object): NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:') HTTP_SCHEMES = ('http:', 'https:') - def __init__(self, verify_http=True, arc2warc=True, fixup_bugs=True, raise_exceptions=False): + def __init__(self, verify_http=True, arc2warc=True, fixup_bugs=True): if arc2warc: self.arc_parser = ARC2WARCHeadersParser() else: @@ -73,7 +103,6 @@ def __init__(self, verify_http=True, arc2warc=True, fixup_bugs=True, raise_excep self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http) self.fixup_bugs = fixup_bugs - self.raise_exceptions = raise_exceptions def parse_record_stream(self, stream, statusline=None, @@ -131,6 
+160,7 @@ def parse_record_stream(self, stream, is_verifying = False digest_checker = DigestChecker(check_digests) + commentary = Commentary() # limit stream to the length for all valid records if length is not None and length >= 0: @@ -155,7 +185,8 @@ def parse_record_stream(self, stream, return ArcWarcRecord(the_format, rec_type, rec_headers, stream, http_headers, - content_type, length, digest_checker=digest_checker, raise_exceptions=self.raise_exceptions) + content_type, length, digest_checker=digest_checker, + commentary=commentary) def wrap_digest_verifying_stream(self, stream, rec_type, rec_headers, digest_checker, length=None): payload_digest = rec_headers.get_header('WARC-Payload-Digest') diff --git a/warcio/tester.py b/warcio/tester.py index 84ea75c3..cee5344f 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -8,45 +8,8 @@ from warcio.archiveiterator import WARCIterator from warcio.utils import to_native_str, Digester from warcio.exceptions import ArchiveLoadFailed -from warcio.bufferedreaders import ChunkedDataException, DecompressionException - - -class Commentary(object): - def __init__(self, record_id=None, rec_type=None): - self._record_id = record_id - self._rec_type = rec_type - self.errors = [] - self.recommendations = [] - self._comments = [] - - def record_id(self): - return self._record_id - - def rec_type(self): - return self._rec_type - - def error(self, *args): - self.errors.append(args) - - def recommendation(self, *args): - self.recommendations.append(args) - - def comment(self, *args): - self._comments.append(args) - - def has_comments(self): - if self.errors or self.recommendations or self._comments: - return True - - def comments(self): - # XXX str() all of these, in case an int or other thing slips in? 
- for e in self.errors: - yield 'error: ' + ' '.join(e) - for r in self.recommendations: - yield 'recommendation: ' + ' '.join(r) - for c in self._comments: - yield 'comment: ' + ' '.join(c) - +from warcio.bufferedreaders import ChunkedDataException +from warcio.recordloader import Commentary class WrapRecord(object): def __init__(self, obj): @@ -662,9 +625,7 @@ def validate_record_against_rec_type(config, record, commentary, pending): def validate_record(record): version = record.rec_headers.protocol.split('/', 1)[1] # XXX not exported - record_id = record.rec_headers.get_header('WARC-Record-ID') - rec_type = record.rec_headers.get_header('WARC-Type') - commentary = Commentary(record_id=record_id, rec_type=rec_type) + commentary = record.commentary pending = None seen_fields = set() @@ -683,6 +644,7 @@ def validate_record(record): if 'validate' in config: config['validate'](field, value, record, version, commentary, pending) + rec_type = record.rec_headers.get_header('WARC-Type') if rec_type not in record_types: # we print a comment for this elsewhere pass @@ -839,37 +801,31 @@ def _process_one(warcfile, all_records, concurrent_to, verbose): if warcfile.endswith('.arc') or warcfile.endswith('.arc.gz'): return with open(warcfile, 'rb') as stream: - for record in WARCIterator(stream, check_digests=True, fixup_bugs=False, raise_exceptions=True): - #for record in WARCIterator(stream, check_digests=True, fixup_bugs=False): - + for record in WARCIterator(stream, check_digests=True, fixup_bugs=False): record = WrapRecord(record) digest_present = (record.rec_headers.get_header('WARC-Payload-Digest') or record.rec_headers.get_header('WARC-Block-Digest')) + record_id = record.rec_headers.get_header('WARC-Record-ID') + rec_type = record.rec_headers.get_header('WARC-Type') - commentary = validate_record(record) - save_global_info(record, warcfile, commentary, all_records, concurrent_to) + validate_record(record) + record.stream_for_digest_check() - try: - 
record.stream_for_digest_check() - except ChunkedDataException as e: - commentary.comment('Transfer-Encoding: chunked, saw exception: '+str(e)) - pass - except DecompressionException as e: - commentary.comment('Content-Encoding indicates compression, saw: '+str(e)) - pass + commentary = record.commentary + save_global_info(record, warcfile, commentary, all_records, concurrent_to) if verbose or commentary.has_comments() or record.digest_checker.passed is False: - print(' ', 'WARC-Record-ID', commentary.record_id()) - print(' ', 'WARC-Type', commentary.rec_type()) - + print(' ', 'WARC-Record-ID', record_id) + print(' ', 'WARC-Type', rec_type) if record.digest_checker.passed is True: print(' digest pass') elif record.digest_checker.passed is None: if digest_present: - if commentary.rec_type() == 'revisit': + if rec_type == 'revisit': print(' digest present but not checked (revisit)') else: # pragma: no cover - # WARC record missing Content-Length: header, which is verboten + # should not ever happen + # example reason: WARC record missing Content-Length: header, but that case raises print(' digest present but not checked') else: print(' digest not present') From a471222c96589e7f5d7e7745aa3a2b72fcf2f2b8 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Thu, 4 Apr 2019 23:35:46 -0700 Subject: [PATCH 33/68] tweak to match new test files --- test/test_cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_cli.py b/test/test_cli.py index 9e356912..103d5d9b 100644 --- a/test/test_cli.py +++ b/test/test_cli.py @@ -85,7 +85,7 @@ def test_check_valid(): args = ['check', '-v'] + filenames value = check_helper(args, 0) - assert value.count(b'digest pass') == 2 + assert value.count(b'digest pass') == 4 assert value.count(b'WARC-Record-ID') == 12 From 30a86fe1f888d38b27855f05807ae48fad5c4c3e Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Mon, 9 Sep 2019 11:03:30 -0700 Subject: [PATCH 34/68] tests pass --- test/test_check_digest_examples.py | 5 +++-- 
1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/test_check_digest_examples.py b/test/test_check_digest_examples.py index 679d7d24..89eb296f 100644 --- a/test/test_check_digest_examples.py +++ b/test/test_check_digest_examples.py @@ -9,7 +9,8 @@ 'example-iana.org-chunked.warc', 'example-wrong-chunks.warc.gz', 'example-bad-non-chunked.warc.gz', - 'example-digest.warc' + 'example-digest-bad.warc', + 'standard-torture-validate-field.warc', ] @@ -34,7 +35,7 @@ def check_helper(self, args, expected_exit_value, capsys): return capsys.readouterr()[0] # list for py33 support def test_check_invalid(self, capsys): - filenames = [get_test_file('example-digest.warc')] + filenames = [get_test_file('example-digest-bad.warc')] args = ['check'] + filenames value = self.check_helper(args, 1, capsys) From 19dc8b3e0e67f1384aba18fa56d34456165592d8 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Fri, 25 Jan 2019 16:05:38 -0800 Subject: [PATCH 35/68] warcio test --- ...le-digest.warc => example-digest-bad.warc} | 0 test/test_archiveiterator.py | 10 +- warcio/archiveiterator.py | 5 +- warcio/cli.py | 12 + warcio/recordloader.py | 10 +- warcio/tester.py | 638 ++++++++++++++++++ 6 files changed, 664 insertions(+), 11 deletions(-) rename test/data/{example-digest.warc => example-digest-bad.warc} (100%) create mode 100644 warcio/tester.py diff --git a/test/data/example-digest.warc b/test/data/example-digest-bad.warc similarity index 100% rename from test/data/example-digest.warc rename to test/data/example-digest-bad.warc diff --git a/test/test_archiveiterator.py b/test/test_archiveiterator.py index 10914ce5..066b53fb 100644 --- a/test/test_archiveiterator.py +++ b/test/test_archiveiterator.py @@ -283,6 +283,8 @@ def test_err_arc_iterator_on_warc(self): def test_corrects_wget_bug(self): with self._find_first_by_type('example-wget-bad-target-uri.warc.gz', 'response') as record: assert record.rec_headers.get('WARC-Target-URI') == 'http://example.com/' + with 
self._find_first_by_type('example-wget-bad-target-uri.warc.gz', 'response', fixup_bugs=False) as record: + assert record.rec_headers.get('WARC-Target-URI') == 'http://example.com/' def test_corrects_space_in_target_uri(self): with self._find_first_by_type('example-space-in-target-uri.warc.gz', 'resource') as record: @@ -345,9 +347,9 @@ def test_digests_file(self): expected_t = ['request', 'request', 'request'] # record 1: invalid payload digest - assert self._load_archive('example-digest.warc', check_digests=True) == expected_t - assert self._load_archive('example-digest.warc', check_digests=False) == expected_f + assert self._load_archive('example-digest-bad.warc', check_digests=True) == expected_t + assert self._load_archive('example-digest-bad.warc', check_digests=False) == expected_f # record 2: b64 digest; record 3: b64 filename safe digest - assert self._load_archive('example-digest.warc', offset=922, check_digests=True) == expected_t - assert self._load_archive('example-digest.warc', offset=922, check_digests=False) == expected_t + assert self._load_archive('example-digest-bad.warc', offset=922, check_digests=True) == expected_t + assert self._load_archive('example-digest-bad.warc', offset=922, check_digests=False) == expected_t diff --git a/warcio/archiveiterator.py b/warcio/archiveiterator.py index 484b7f0f..24094936 100644 --- a/warcio/archiveiterator.py +++ b/warcio/archiveiterator.py @@ -56,12 +56,13 @@ class ArchiveIterator(six.Iterator): def __init__(self, fileobj, no_record_parse=False, verify_http=False, arc2warc=False, ensure_http_headers=False, block_size=BUFF_SIZE, - check_digests=False): + check_digests=False, fixup_bugs=True): self.fh = fileobj self.loader = ArcWarcRecordLoader(verify_http=verify_http, - arc2warc=arc2warc) + arc2warc=arc2warc, + fixup_bugs=fixup_bugs) self.known_format = None self.mixed_arc_warc = arc2warc diff --git a/warcio/cli.py b/warcio/cli.py index efdf7c50..ada44f12 100644 --- a/warcio/cli.py +++ b/warcio/cli.py @@ -4,6 
+4,8 @@ from warcio.checker import Checker from warcio.extractor import Extractor from warcio.recompressor import Recompressor +from warcio.tester import Tester +from warcio.utils import BUFF_SIZE import sys @@ -51,6 +53,10 @@ def main(args=None): check.add_argument('-v', '--verbose', action='store_true') check.set_defaults(func=checker) + test = subparsers.add_parser('test', help='WARC standards tester') + test.add_argument('inputs', nargs='+') + test.set_defaults(func=tester) + cmd = parser.parse_args(args=args) cmd.func(cmd) @@ -86,6 +92,12 @@ def recompressor(cmd): _recompressor.recompress() +# ============================================================================ +def tester(cmd): + _tester = Tester(cmd) + sys.exit(_tester.process_all()) + + # ============================================================================ if __name__ == "__main__": #pragma: no cover main() diff --git a/warcio/recordloader.py b/warcio/recordloader.py index 05b159df..2f48233b 100644 --- a/warcio/recordloader.py +++ b/warcio/recordloader.py @@ -58,7 +58,7 @@ class ArcWarcRecordLoader(object): NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:') HTTP_SCHEMES = ('http:', 'https:') - def __init__(self, verify_http=True, arc2warc=True): + def __init__(self, verify_http=True, arc2warc=True, fixup_bugs=True): if arc2warc: self.arc_parser = ARC2WARCHeadersParser() else: @@ -68,6 +68,7 @@ def __init__(self, verify_http=True, arc2warc=True): self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http) self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http) + self.fixup_bugs = fixup_bugs def parse_record_stream(self, stream, statusline=None, @@ -99,7 +100,7 @@ def parse_record_stream(self, stream, elif the_format in ('warc', 'arc2warc'): rec_type = rec_headers.get_header('WARC-Type') - uri = self._ensure_target_uri_format(rec_headers) + uri = self._ensure_target_uri_format(rec_headers, fixup_bugs=self.fixup_bugs) length = rec_headers.get_header('Content-Length') 
content_type = rec_headers.get_header('Content-Type') if the_format == 'warc': @@ -238,7 +239,7 @@ def _detect_type_load_headers(self, stream, msg = 'Unknown archive format, first line: ' raise ArchiveLoadFailed(msg + str(se.statusline)) - def _ensure_target_uri_format(self, rec_headers): + def _ensure_target_uri_format(self, rec_headers, fixup_bugs=True): """Checks the value for the WARC-Target-URI header field to see if it starts with '<' and ends with '>' (Wget 1.19 bug) and if '<' and '>' are present, corrects and updates the field returning the corrected value for the field @@ -251,8 +252,7 @@ def _ensure_target_uri_format(self, rec_headers): :rtype: str | None """ uri = rec_headers.get_header('WARC-Target-URI') - - if uri is not None and uri.startswith('<') and uri.endswith('>'): + if fixup_bugs and uri is not None and uri.startswith('<') and uri.endswith('>'): uri = uri[1:-1] rec_headers.replace_header('WARC-Target-URI', uri) diff --git a/warcio/tester.py b/warcio/tester.py new file mode 100644 index 00000000..800f797e --- /dev/null +++ b/warcio/tester.py @@ -0,0 +1,638 @@ +from __future__ import print_function + +import re +import ipaddress +import sys +import traceback + +from warcio.archiveiterator import WARCIterator + + +class Commentary: + def __init__(self, record_id, rec_type): + self._record_id = record_id + self._rec_type = rec_type + self.errors = [] + self.recommendations = [] + self._comments = [] + + def record_id(self): + return self._record_id + + def rec_type(self): + return self._rec_type + + def error(self, *args): + self.errors.append(args) + + def recommendation(self, *args): + self.recommendations.append(args) + + def comment(self, *args): + self._comments.append(args) + + def has_comments(self): + if self.errors or self.recommendations or self._comments: + return True + + def comments(self): + for e in self.errors: + yield 'error: ' + ' '.join(e) + for r in self.recommendations: + yield 'recommendation: ' + ' '.join(r) + for c in 
self._comments: + yield 'comment: ' + ' '.join(c) + + +class WrapRecord(object): + def __init__(self, obj): + self.obj = obj + self._content = None + + def __getattr__(self, name): + if name == 'content': + if self._content is None: + self._content = self.obj.content_stream().read() + return self._content + return getattr(self.__dict__['obj'], name) + + +def canon_content_type(s): + return s.lower().replace('; ', ';') + + +def validate_warc_fields(record, commentary): + # warc-fields = *named-field CRLF + # named-field = field-name ":" [ field-value ] + # field-value = *( field-content | LWS ) # LWS signals continuations + # field-name = token # token_re + + content = record.content + try: + text = content.decode('utf-8', errors='strict') + except UnicodeDecodeError as e: + commentary.error('warc-fields contains invalid utf-8: '+str(e)) + text = content.decode('utf-8', errors='replace') + + first_line = True + lines = [] + for line in text.splitlines(True): + if not line.endswith('\r\n'): + commentary.error('warc-fields lines must end with \r\n') + line = line.rstrip('\r\n') + else: + line = line[:-2] + + if line.startswith(' ') or line.startswith('\t'): + if first_line: + commentary.error('The first line of warc-fields cannot start with whitespace') + else: + lines[-1] += ' ' + line[1:] + elif line == '': + # are blank lines prohibited? 
+ pass + else: + # check for field-name : + if ':' not in line: + commentary.error('Missing field-name : in warc-fields line', line) + else: + field_name = line.split(':', 1)[0] + if not re.fullmatch(token_re, field_name): + commentary('invalid warc-fields name', field_name) + else: + lines.append(line) + first_line = False + + # check known fields + + +def validate_warcinfo(record, commentary, pending): + content_type = record.rec_headers.get_header('Content-Type') + if content_type.lower() != 'application/warc-fields': + commentary.recommencation('warcinfo Content-Type of application/warc-fields, saw', content_type) + else: + # format: warc-fields + # allowable fields include but not limited to DMCI plus the following + # operator, software, robots, hostname, ip, http-header-user-agent, http-header-from + # if operator present, recommended name or name and email address + # comment if http-user-agent here and in the request or metadata record? + # comment if http-header-from here and in the request? 
+ validate_warc_fields(record, commentary) + + # whole-file tests: + # optional that warcinfo be first in file, still deserves a comment + # allowable for warcinfo to appear anywhere + + +def validate_response(record, commentary, pending): + target_uri = record.rec_headers.get_header('WARC-Target-URI').lower() + + if target_uri.startswith('http:') or target_uri.startswith('https:'): + content_type = record.rec_headers.get_header('Content-Type') + if canon_content_type(content_type) not in {'application/http;msgtype=response', 'application/http'}: + commentary.error('responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw ', content_type) + + if record.rec_headers.get_header('WARC-IP-Address') is None: + commentary.error('WARC-IP-Address should be used for http and https responses') + + # error: http and https schemes should have http response headers + # comment: verify http content-length, if present -- commoncrawl nutch bug + + +def validate_resource(record, commentary, pending): + target_uri = record.rec_headers.get_header('WARC-Target-URI').lower() + + if target_uri.startswith('dns:'): + content_type = record.rec_headers.get_header('Content-Type') + if content_type.lower() != 'text/dns': + commentary.error('recource records for dns: shall have Content-Type of text/dns, saw', content_type) + else: + # rfc 2540 and rfc 1035 + #validate_text_dns() + pass + + # should never have http headers + + +def validate_request(record, commentary, pending): + target_uri = record.rec_headers.get_header('WARC-Target-URI').lower() + + if target_uri.startswith('http:') or target_uri.startswith('https:'): + content_type = record.rec_headers.get_header('Content-Type') + + if canon_content_type(content_type) not in {'application/http;msgtype=request', 'application/http'}: + commentary.error('requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw ', content_type) + + if 
record.rec_headers.get_header('WARC-IP-Address') is None: + commentary.error('WARC-IP-Address should be used for http and https requests') + + # error: http and https schemes should have http request headers + + # WARC-Concurrent-To field or fields may be used, comment if present but target record is not + + +def validate_metadata(record, commentary, pending): + content_type = record.rec_headers.get_header('Content-Type') + if content_type.lower() == 'application/warc-fields': + # dublin core plus via, hopsFromSeed, fetchTimeMs -- w1.1 section 6 + # via: uri -- example in Warc 1.1 section 10.5 does not have <> around it + # hopsFromSeed: string + # fetchTimeMs: time in milliseconds, so it's an integer? + validate_warc_fields(record, commentary) + + +def validate_revisit(record, commentary, pending): + warc_profile = record.rec_headers.get_header('WARC-Profile') + + if warc_profile.endswith('/revisit/identical-payload-digest') or warc_profile.endswith('/revisit/uri-agnostic-identical-payload-digest'): + config = { + 'required': ['WARC-Payload-Digest'], + 'recommended': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date'], + } + validate_fields_against_rec_type('revisit', config, record.rec_headers, commentary, allow_all=True) + # may have record block; if not, shall have Content-Length: 0, if yes, should be like a response record, truncated FOR LENGTH ONLY if desired + # recommended that server response headers be preserved "in this manner" + + elif warc_profile.ends_with('/revisit/server-not-modified'): + config = { + 'recommended': ['WARC-Refers-To', 'WARC-Refers-To-Date'], + 'prohibited': ['WARC-Payload-Digest'], + } + validate_fields_against_rec_type('revisit', config, record.rec_headers, commentary, allow_all=True) + # may have content body; if not, shall have Content-Length: 0, if yes, should be like a response record, truncated if desired + # WARC-Refers-To-Date should be the same as WARC-Date in the original record if present + else: + 
commentary.comment('no revisit details validation done due to unknown profile') + + +def validate_conversion(record, commentary, pending): + # where practical, have a warc-refers-to field -- not quite a recommendation, perhaps make it a comment? + # suggests there should be a corresponding metadata record -- which may have a WARC-Refers-To + pass + + +def validate_continuation(record, commentary, pending): + commentary.comment('warcio test continuation code has not been tested, expect bugs') + + warc_type = record.rec_headers.get_header('WARC-Type') + if warc_type in {'warcinfo', 'request', 'metadata', 'revisit'}: + commentary.recommendation('do not segment warc-type', warc_type) + + # last segment: required WARC-Segment-Total-Length, optional WARC-Truncated + + +def validate_actual_uri(field, value, record, version, commentary, pending): + # uri per RFC 3986 + # should use a registered scheme + # %XX encoding, normalize to upper case + # schemes are case-insensitive and normalize to lower + if value.startswith('<') or value.endswith('>'): + # wget 1.19 bug caused by WARC 1.0 spec error + commentary.error('uri must not be within <>', field, value) + if ':' not in value: + commentary.error('invalid uri, no scheme', field, value) + if re.search(r'\s', value, re.A): + commentary.error('invalid uri, contains whitespace', field, value) + scheme, rest = value.split(':', 1) + if not re.fullmatch(r'[A-Za-z][A-Za-z0-9+\-\.]*', scheme, re.A): + commentary.error('invalid uri scheme, bad character', field, value) + # https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml + + +def validate_warc_type(field, value, record, version, commentary, pending): + if not value.islower(): + # I am unclear if this is allowed? 
standard is silent + commentary.comment('Warc-Type is not lower-case', field, value) + if value.lower() not in record_types: + # standard says readers should ignore unknown warc-types + commentary.comment('unknown Warc-Type', field, value) + + +def validate_uri(field, value, record, version, commentary, pending): + # < uri > + if not (value.startswith('<') and value.endswith('>')): + commentary.error('uri must be within <>', field, value) + return + validate_actual_uri(field, value[1:-1], record, version, commentary, pending) + + +def validate_record_id(field, value, record, version, commentary, pending): + validate_uri(field, value, record, version, commentary, pending) + # TODO: should be "globally unique for its period of intended use" + + +def validate_timestamp(field, value, record, version, commentary, pending): + use_ms = False if version == '1.0' else True + if not use_ms: + if '.' in value: + # XXX specification infelicity: would be nice to have 'advice to implementers' here + commentary.error('WARC 1.0 may not have fractional seconds', field, value) + else: + start, end = value.split('.', 1) + if not re.fullmatch(r'[0-9]{1,9}Z', end, re.A): + commentary.error('fractional seconds must have 1-9 digits', field, value) + + # XXX the above is pretty incomplete for dash, colon, trailing Z, etc + + # TODO: "multiple records written as part of a single capture event shall use the same WARC-Date" + # how? 
follow WARC-Concurrent-To pointer(s) from request to response(s) + + +def validate_content_length(field, value, record, version, commentary, pending): + if not value.isdigit(): + commentary.error('must be an integer', field, value) + + +token_re = r'[!"#$%&\'()*+\-\.0-9A-Z\^_`a-z|~]+' +digest_re = r'[A-Za-z0-9/+\-_=]+' + + +def validate_content_type(field, value, record, version, commentary, pending): + if '/' not in value: + commentary.error('must contain a /', field, value) + ctype, rest = value.split('/', 1) + if not re.fullmatch(token_re, ctype, re.A): + commentary.error('invalid type', field, value) + if ';' in rest: + subtype, rest = rest.split(';', 1) + else: + subtype = rest + if not re.fullmatch(token_re, subtype, re.A): + commentary.error('invalid subtype', field, value) + # at this point there can be multiple parameters, + # some of which could have quoted string values with ; in them + # TODO: more checking + + +def validate_digest(field, value, record, version, commentary, pending): + if ':' not in value: + commentary.error('missing algorithm', field, value) + algorithm, digest = value.split(':', 1) + if not re.fullmatch(token_re, algorithm, re.A): + commentary.error('invalid algorithm', field, value) + if not re.fullmatch(token_re, digest, re.A): + # https://github.com/iipc/warc-specifications/issues/48 + # commentary.comment('spec incorrectly says this is an invalid digest', field, value) + pass + if not re.fullmatch(digest_re, digest, re.A): + commentary.comment('Invalid-looking digest value', field, value) + + +def validate_ip(field, value, record, version, commentary, pending): + # ipv4 as dotted quad, or ipv6 per section 2.2 of rfc 4291 + try: + ipaddress.ip_address(value) + except ValueError: + commentary.error('invalid ip', field, value) + + +def validate_truncated(field, value, record, version, commentary, pending): + if value.lower() not in {'length', 'time', 'disconnect', 'unspecified'}: + commentary.comment('extension seen', field, value) + 
+ +def validate_warcinfo_id(field, value, record, version, commentary, pending): + validate_uri(field, value, record, version, commentary, pending) + # TODO: should point at a warcinfo record + + +def validate_filename(field, value, record, version, commentary, pending): + # TODO: text or quoted-string + pass + + +profiles = { + '1.0': ['http://netpreserve.org/warc/1.1/revisit/identical-payload-digest', + 'http://netpreserve.org/warc/1.1/revisit/server-not-modified', + # the following removed from iipc/webarchive-commons in may 2017; common in the wild TODO comment or not? + 'http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest'], + '1.1': ['http://netpreserve.org/warc/1.0/revisit/identical-payload-digest', + 'http://netpreserve.org/warc/1.0/revisit/server-not-modified'], +} + + +def validate_profile(field, value, record, version, commentary, pending): + if version not in profiles: + commentary.comment('no profile check because unknown warc version', field, value) + return + if value not in profiles[version]: + commentary.comment('extension seen', field, value) + + +def validate_segment_number(field, value, record, version, commentary, pending): + if not value.isdigit(): + commentary.error('must be an integer', field, value) + iv = int(value) + if iv == 0: + commentary.error('must be 1 or greater', field, value) + # TODO: type != continuation must have iv == 1, else iv > 1 + # might make that check in the 'continuation' section? 
+ + +def validate_segment_total_length(field, value, record, version, commentary, pending): + if not value.isdigit(): + commentary.error('must be an integer', field, value) + + +warc_fields = { + 'WARC-Type': { + 'validate': validate_warc_type, + }, + 'WARC-Record-ID': { + 'validate': validate_record_id, + }, + 'WARC-Date': { + 'validate': validate_timestamp, + }, + 'Content-Length': { + 'validate': validate_content_length, + }, + 'Content-Type': { + 'validate': validate_content_type, + }, + 'WARC-Concurrent-To': { + 'validate': validate_uri, + }, + 'WARC-Block-Digest': { + 'validate': validate_digest, # openssl check? or just let check_digest get it? + }, + 'WARC-Payload-Digest': { + 'validate': validate_digest, + }, + 'WARC-IP-Address': { + 'validate': validate_ip, + }, + 'WARC-Refers-To': { + 'validate': validate_uri, + }, + 'WARC-Target-URI': { + 'validate': validate_actual_uri, + }, + 'WARC-Truncated': { + 'validate': validate_truncated, + }, + 'WARC-Warcinfo-ID': { + 'validate': validate_warcinfo_id, + }, + 'WARC-Filename': { + 'validate': validate_filename, + }, + 'WARC-Profile': { + 'validate': validate_profile, + }, + 'WARC-Identified-Payload-Type': { + 'validate': validate_content_type, + }, + 'WARC-Segment-Origin-ID': { + 'validate': validate_uri, + }, + 'WARC-Segment-Number': { + 'validate': validate_segment_number, + }, + 'WARC-Segment-Total-Length': { + 'validate': validate_segment_total_length, + }, + 'WARC-Refers-To-Target-URI': { + 'validate': validate_actual_uri, + 'minver': '1.1', + }, + 'WARC-Refers-To-Date': { + 'validate': validate_timestamp, + 'minver': '1.1', + }, +} +warc_fields = dict([(k.lower(), v) for k, v in warc_fields.items()]) + +record_types = { + 'warcinfo': { + 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'Content-Type'], + 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Filename', 'WARC-Truncated'], + 'prohibited': ['WARC-Refers-To', 'WARC-Profile', 'WARC-Identified-Payload-Type'], + 
'validate': validate_warcinfo, + }, + 'response': { + 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', + 'Content-Type', 'WARC-Target-URI'], + 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated', + 'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address'], + 'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], + 'validate': validate_response, + }, + 'resource': { + 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'WARC-Target-URI', 'Content-Type'], + 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated', + 'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-Identified-Payload-Type'], + 'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], + }, + 'request': { + 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', + 'Content-Type', 'WARC-Target-URI'], + 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated', + 'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address'], + 'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], + 'validate': validate_request, + }, + 'metadata': { + 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', + 'Content-Type'], + 'optional': ['WARC-Block-Digest', 'WARC-IP-Address', 'WARC-Truncated', + 'WARC-Concurrent-To', 'WARC-Refers-To', 'WARC-Target-URI', 'WARC-Warcinfo-ID'], + 'prohibited': ['WARC-Payload-Digest', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], + 'validate': validate_metadata, + }, + 'revisit': { + 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', + 'Content-Type', 'WARC-Target-URI', 'WARC-Profile'], + 'optional': ['WARC-Block-Digest', 'WARC-Truncated', 
'WARC-IP-Address', 'WARC-Warcinfo-ID', # normal optionals + 'WARC-Payload-Digest', 'WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date'], # these are for profiles + 'prohibited': ['WARC-Filename'], + 'validate': validate_revisit, + }, + 'conversion': { + 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'WARC-Target-URI'], + 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Truncated', 'WARC-Refers-To', 'WARC-Warcinfo-ID'], + 'prohibited': ['WARC-Concurrent-To', 'WARC-IP-Address', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], + 'validate': validate_conversion, + }, + 'continuation': { + 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', + 'WARC-Segment-Origin-ID', 'WARC-Segment-Number', 'WARC-Target-URI'], + 'optional': [], + 'prohibited': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], + 'validate': validate_continuation, + }, +} + + +def make_header_set(config, kinds): + ret = set() + for kind in kinds: + ret = ret.union(set([x.lower() for x in config.get(kind, [])])) + return ret + + +def validate_fields_against_rec_type(rec_type, config, rec_headers, commentary, allow_all=False): + for req in config.get('required', []): + if not rec_headers.get_header(req): + commentary.error('missing required header', req) + for rec in config.get('recommended', []): + if not rec_headers.get_header(rec): + commentary.recommendation('missing recommended header', rec) + allowed = make_header_set(config, ('required', 'optional', 'recommended')) + prohibited = make_header_set(config, ('prohibited',)) + + for field, value in rec_headers.headers: + fl = field.lower() + if fl in prohibited: + commentary.error('field not allowed in record_type', field, rec_type) + elif allow_all or fl in allowed: + pass + elif fl in warc_fields: + 
commentary.comment('no configuration seen for', field, rec_type) + else: + # an 'unknown field' comment has already been issued in validate_record + pass + + +def validate_record_against_rec_type(config, record, commentary, pending): + if 'validate' in config: + config['validate'](record, commentary, pending) + + +def validate_record(record): + version = record.rec_headers.protocol.split('/', 1)[1] # XXX not exported? + + record_id = record.rec_headers.get_header('WARC-Record-ID') + rec_type = record.rec_headers.get_header('WARC-Type') + if record_id is None: + print('no WARC-Record-ID seen, skipping validation', file=sys.stderr) + return + commentary = Commentary(record_id, rec_type) + pending = None + + seen_fields = set() + for field, value in record.rec_headers.headers: + field_case = field + field = field.lower() + if field != 'warc-concurrent-to' and field in seen_fields: + commentary.error('duplicate field seen', field, value) + if field not in warc_fields: + commentary.comment('unknown field, no validation performed', field_case, value) + continue + config = warc_fields[field] + if 'minver' in config: + if version < config['minver']: + # unknown fields are extensions, so this is a comment and not an error + commentary.comment('field was introduced after this warc version', field_case, value, version) + if 'validate' in config: + config['validate'](field, value, record, version, commentary, pending) + + # TODO: validate warc types: unknown should get a comment + if rec_type not in record_types: + commentary.comment('unknown record type, no validation performed', rec_type) + else: + validate_fields_against_rec_type(rec_type, record_types[rec_type], record.rec_headers, commentary) + validate_record_against_rec_type(record_types[rec_type], record, commentary, pending) + + return commentary + + +def _process_one(warc): + if warc.endswith('.arc') or warc.endswith('.arc.gz'): + return + with open(warc, 'rb') as stream: + for record in WARCIterator(stream, 
check_digests=True, fixup_bugs=False): + + try: + record = WrapRecord(record) + digest_present = (record.rec_headers.get_header('WARC-Payload-Digest') or + record.rec_headers.get_header('WARC-Block-Digest')) + + commentary = validate_record(record) + + record.content # make sure digests are checked + # XXX might need to read and digest the raw stream to check digests for chunked encoding? + # XXX chunked lacks Content-Length and presumably the digest needs to be computed on the non-chunked bytes + except Exception: + # because of the top-level try: to catch exceptions in WARCIterator, this is needed to debug our code + print('Caught exception in warcio test analysis code') + traceback.print_exc() + exit(1) + + if commentary.has_comments() or record.digest_checker.passed is False: + print(' ', 'WARC-Record-ID', commentary.record_id()) + print(' ', 'WARC-Type', commentary.rec_type()) + + if record.digest_checker.passed is True: + print(' digest pass') + elif record.digest_checker.passed is None: + if digest_present: + print(' digest present but not checked') + else: + print(' digest not present') + for p in record.digest_checker.problems: + print(' ', p) + + if commentary.has_comments(): + for c in commentary.comments(): + print(' ', c) + + +class Tester(object): + def __init__(self, cmd): + self.inputs = cmd.inputs + self.verbose = cmd.verbose + self.exit_value = 0 + + def process_all(self): + for warc in self.inputs: + print(warc) + try: + self.process_one(warc) + except Exception as e: + print(' saw exception '+str(e).rstrip(), file=sys.stderr) + print(' skipping rest of file', file=sys.stderr) + return self.exit_value + + def process_one(self, filename): + _process_one(filename) From 88dff09ee436b1922740d0600ce5e4d50693be4e Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Fri, 25 Jan 2019 16:12:21 -0800 Subject: [PATCH 36/68] documentation --- README.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.rst b/README.rst index 9bc16420..ec16452d 
100644 --- a/README.rst +++ b/README.rst @@ -368,6 +368,14 @@ of WARC records, if possible. An exit value of 1 indicates a failure. ``warcio check -v`` will print verbose output for each record in the WARC file. +Test +~~~~ + +The ``warcio test`` command will check one or more WARC files against +the WARC standard, giving commentary about standards violations, +recommendations, and other issues. + + Recompress ~~~~~~~~~~ From c99bc2e409c46826d33a9b9111cd6a290ef78bb3 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Fri, 25 Jan 2019 16:42:58 -0800 Subject: [PATCH 37/68] tests --- test/test_archiveiterator.py | 2 +- test/test_cli.py | 2 +- warcio/tester.py | 5 +++-- warcio/utils.py | 6 +++--- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/test/test_archiveiterator.py b/test/test_archiveiterator.py index 066b53fb..7378c7af 100644 --- a/test/test_archiveiterator.py +++ b/test/test_archiveiterator.py @@ -284,7 +284,7 @@ def test_corrects_wget_bug(self): with self._find_first_by_type('example-wget-bad-target-uri.warc.gz', 'response') as record: assert record.rec_headers.get('WARC-Target-URI') == 'http://example.com/' with self._find_first_by_type('example-wget-bad-target-uri.warc.gz', 'response', fixup_bugs=False) as record: - assert record.rec_headers.get('WARC-Target-URI') == 'http://example.com/' + assert record.rec_headers.get('WARC-Target-URI') == '' def test_corrects_space_in_target_uri(self): with self._find_first_by_type('example-space-in-target-uri.warc.gz', 'resource') as record: diff --git a/test/test_cli.py b/test/test_cli.py index 7bdc87f7..be82dab8 100644 --- a/test/test_cli.py +++ b/test/test_cli.py @@ -90,7 +90,7 @@ def test_check_valid(): def test_check_invalid(): - filenames = [get_test_file('example-digest.warc')] + filenames = [get_test_file('example-digest-bad.warc')] args = ['check'] + filenames value = check_helper(args, 1) diff --git a/warcio/tester.py b/warcio/tester.py index 800f797e..de456dc8 100644 --- a/warcio/tester.py +++ 
b/warcio/tester.py @@ -6,6 +6,7 @@ import traceback from warcio.archiveiterator import WARCIterator +from warcio.utils import to_native_str class Commentary: @@ -69,10 +70,10 @@ def validate_warc_fields(record, commentary): content = record.content try: - text = content.decode('utf-8', errors='strict') + text = to_native_str(content, 'utf-8', errors='strict') except UnicodeDecodeError as e: commentary.error('warc-fields contains invalid utf-8: '+str(e)) - text = content.decode('utf-8', errors='replace') + text = to_native_str(content, 'utf-8', errors='replace') first_line = True lines = [] diff --git a/warcio/utils.py b/warcio/utils.py index 08783f06..fb544cff 100644 --- a/warcio/utils.py +++ b/warcio/utils.py @@ -13,14 +13,14 @@ # #=========================================================================== -def to_native_str(value, encoding='utf-8'): +def to_native_str(value, encoding='utf-8', errors='strict'): if isinstance(value, str): return value if six.PY3 and isinstance(value, six.binary_type): #pragma: no cover - return value.decode(encoding) + return value.decode(encoding, errors) elif six.PY2 and isinstance(value, six.text_type): #pragma: no cover - return value.encode(encoding) + return value.encode(encoding, errors) else: return value From 003933534b54055aae8dcc977ebfabb70c2a5e0a Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Fri, 25 Jan 2019 17:03:04 -0800 Subject: [PATCH 38/68] tests --- warcio/tester.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/warcio/tester.py b/warcio/tester.py index de456dc8..386586bb 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -1,7 +1,6 @@ from __future__ import print_function import re -import ipaddress import sys import traceback @@ -9,6 +8,14 @@ from warcio.utils import to_native_str +def try_ipaddress_init(): + # ipaddress is in 3.3+ but not 2.7. It is in pypi but we wish to limit dependencies. 
+ try: + import ipaddress + except ImportError: # pragma: no cover + pass + + class Commentary: def __init__(self, record_id, rec_type): self._record_id = record_id @@ -325,6 +332,8 @@ def validate_ip(field, value, record, version, commentary, pending): ipaddress.ip_address(value) except ValueError: commentary.error('invalid ip', field, value) + except NameError: + commentary.comment('did not check ip address format, install ipaddress module from pypi if you care') def validate_truncated(field, value, record, version, commentary, pending): @@ -622,8 +631,8 @@ def _process_one(warc): class Tester(object): def __init__(self, cmd): self.inputs = cmd.inputs - self.verbose = cmd.verbose self.exit_value = 0 + try_ipaddress_init() def process_all(self): for warc in self.inputs: From 9b7c9ce8af01d6e66fbe17b36e4bec043f746b3e Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Sat, 26 Jan 2019 08:39:22 -0800 Subject: [PATCH 39/68] coverage --- test/data/standard-torture-missing.warc | 5 + .../standard-torture-validate-record.warc | 79 ++++++++++ test/test_tests.py | 149 ++++++++++++++++++ warcio/tester.py | 79 ++++++---- 4 files changed, 278 insertions(+), 34 deletions(-) create mode 100644 test/data/standard-torture-missing.warc create mode 100644 test/data/standard-torture-validate-record.warc create mode 100644 test/test_tests.py diff --git a/test/data/standard-torture-missing.warc b/test/data/standard-torture-missing.warc new file mode 100644 index 00000000..a1ab0714 --- /dev/null +++ b/test/data/standard-torture-missing.warc @@ -0,0 +1,5 @@ +WARC/1.0 +WARC-Type: warcinfo +Content-Length: 0 + + diff --git a/test/data/standard-torture-validate-record.warc b/test/data/standard-torture-validate-record.warc new file mode 100644 index 00000000..5181ea38 --- /dev/null +++ b/test/data/standard-torture-validate-record.warc @@ -0,0 +1,79 @@ +WARC/1.0 +WARC-Type: warcinfo +Content-Type: application/warc-fields +Content-Length: 146 + + first line can't start with a space +test: 
invalid utf8 �( +test: lines should end with \r\n +foo: + bar + +no colon +token cannot have a space: + + +WARC/1.0 +WARC-Type: warcinfo +Content-Type: application/warc-fields +Content-Length: 0 + + +WARC/1.0 +WARC-Type: response +WARC-Target-URI: HtTp://example.com/ +Content-Type: text/plain +Content-Length: 0 + + +WARC/1.0 +WARC-Type: resource +WARC-Target-URI: DnS:asdfasdf +Content-Type: text/plain +Content-Length: 0 + + +WARC/1.0 +WARC-Type: resource +WARC-Target-URI: DnS:asdfasdf +Content-Type: text/dns +Content-Length: 0 + + +WARC/1.0 +WARC-Type: request +WARC-Target-URI: hTtP://example.com/ +Content-Type: text/plain +Content-Length: 0 + + +WARC/1.0 +WARC-Type: metadata +Content-Type: application/warc-fields +Content-Length: 0 + + +WARC/1.0 +WARC-Type: revisit +WARC-Profile: none +Content-Length: 0 + + +WARC/1.0 +WARC-Type: revisit +WARC-Profile: http://netpreserve.org/warc/1.1/revisit/identical-payload-digest +Content-Length: 0 + + +WARC/1.0 +WARC-Type: revisit +WARC-Profile: http://netpreserve.org/warc/1.0/revisit/server-not-modified +Content-Length: 0 + + +WARC/1.0 +WARC-Type: continuation +WARC-Segment-Number: 1 +Content-Length: 0 + + diff --git a/test/test_tests.py b/test/test_tests.py new file mode 100644 index 00000000..239d2461 --- /dev/null +++ b/test/test_tests.py @@ -0,0 +1,149 @@ +from warcio.cli import main + +from . 
import get_test_file +from .test_cli import patch_stdout + + +def helper(args, expected_exit_value): + with patch_stdout() as buff: + exit_value = None + try: + main(args=args) + except SystemExit as e: + exit_value = e.code + finally: + assert exit_value == expected_exit_value + + return buff.getvalue() + + +def remove_before_test_data(s): + ret = b'' + for line in s.splitlines(True): + if b'/test/data/' in line: + line = b'test/data/' + line.split(b'/test/data/', 1)[1] + ret += line + return ret + + +def test_torture_missing(): + files = ['standard-torture-missing.warc'] + files = [get_test_file(filename) for filename in files] + + args = ['test'] + args.extend(files) + + expected = b"""\ +test/data/standard-torture-missing.warc + WARC-Record-ID None + WARC-Type warcinfo + digest not present + error: missing required header Content-Type + error: missing required header WARC-Date + error: missing required header WARC-Record-ID + recommendation: warcinfo Content-Type of application/warc-fields, saw none +""" + + value = helper(args, 0) + assert remove_before_test_data(value) == expected + + +def test_torture_validate_record(): + files = ['standard-torture-validate-record.warc'] + files = [get_test_file(filename) for filename in files] + + args = ['test'] + args.extend(files) + + expected = b"""\ +test/data/standard-torture-validate-record.warc + WARC-Record-ID None + WARC-Type warcinfo + digest not present + error: missing required header WARC-Date + error: missing required header WARC-Record-ID + error: warc-fields contains invalid utf-8: 'utf-8' codec can't decode byte 0xc3 in position 57: invalid continuation byte + comment: The first line of warc-fields cannot start with whitespace + comment: warc-fields lines must end with \\r\\n: test: lines should end with \\r\\n + comment: Missing field-name : in warc-fields line: no colon + comment: invalid warc-fields name: token cannot have a space + WARC-Record-ID None + WARC-Type warcinfo + digest not present + error: 
missing required header WARC-Date + error: missing required header WARC-Record-ID + comment: warc-fields body present but empty + WARC-Record-ID None + WARC-Type response + digest not present + error: missing required header WARC-Date + error: missing required header WARC-Record-ID + error: responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw text/plain + error: WARC-IP-Address should be used for http and https responses + WARC-Record-ID None + WARC-Type resource + digest not present + error: missing required header WARC-Date + error: missing required header WARC-Record-ID + WARC-Record-ID None + WARC-Type resource + digest not present + error: missing required header WARC-Date + error: missing required header WARC-Record-ID + WARC-Record-ID None + WARC-Type request + digest not present + error: missing required header WARC-Date + error: missing required header WARC-Record-ID + error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw text/plain + error: WARC-IP-Address should be used for http and https requests + WARC-Record-ID None + WARC-Type metadata + digest not present + error: missing required header WARC-Date + error: missing required header WARC-Record-ID + comment: warc-fields body present but empty + WARC-Record-ID None + WARC-Type revisit + digest not present + error: missing required header Content-Type + error: missing required header WARC-Date + error: missing required header WARC-Record-ID + error: missing required header WARC-Target-URI + comment: extension seen warc-profile none + comment: no revisit details validation done due to unknown profile + WARC-Record-ID None + WARC-Type revisit + digest not present + error: missing required header Content-Type + error: missing required header WARC-Date + error: missing required header WARC-Record-ID + error: missing required header WARC-Target-URI + error: missing required header 
WARC-Payload-Digest + recommendation: missing recommended header WARC-Refers-To + recommendation: missing recommended header WARC-Refers-To-Date + recommendation: missing recommended header WARC-Refers-To-Target-URI + WARC-Record-ID None + WARC-Type revisit + digest not present + error: missing required header Content-Type + error: missing required header WARC-Date + error: missing required header WARC-Record-ID + error: missing required header WARC-Target-URI + recommendation: missing recommended header WARC-Refers-To + recommendation: missing recommended header WARC-Refers-To-Date + comment: extension seen warc-profile http://netpreserve.org/warc/1.0/revisit/server-not-modified + WARC-Record-ID None + WARC-Type continuation + digest not present + error: missing required header WARC-Date + error: missing required header WARC-Record-ID + error: missing required header WARC-Segment-Origin-ID + error: missing required header WARC-Target-URI + error: continuation record must have WARC-Segment-Number > 1, saw 1 + comment: warcio test continuation code has not been tested, expect bugs +""" + + value = helper(args, 0) + print(remove_before_test_data(value).decode()) + assert remove_before_test_data(value) == expected diff --git a/warcio/tester.py b/warcio/tester.py index 386586bb..bdfe38f0 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -75,10 +75,10 @@ def validate_warc_fields(record, commentary): # field-value = *( field-content | LWS ) # LWS signals continuations # field-name = token # token_re - content = record.content + content = record.content # TESTME try: text = to_native_str(content, 'utf-8', errors='strict') - except UnicodeDecodeError as e: + except UnicodeDecodeError as e: # TESTME commentary.error('warc-fields contains invalid utf-8: '+str(e)) text = to_native_str(content, 'utf-8', errors='replace') @@ -86,14 +86,14 @@ def validate_warc_fields(record, commentary): lines = [] for line in text.splitlines(True): if not line.endswith('\r\n'): - 
commentary.error('warc-fields lines must end with \r\n') + commentary.comment('warc-fields lines must end with \\r\\n:', line.rstrip()) line = line.rstrip('\r\n') else: line = line[:-2] if line.startswith(' ') or line.startswith('\t'): if first_line: - commentary.error('The first line of warc-fields cannot start with whitespace') + commentary.comment('The first line of warc-fields cannot start with whitespace') else: lines[-1] += ' ' + line[1:] elif line == '': @@ -102,22 +102,26 @@ def validate_warc_fields(record, commentary): else: # check for field-name : if ':' not in line: - commentary.error('Missing field-name : in warc-fields line', line) + commentary.comment('Missing field-name : in warc-fields line:', line) else: field_name = line.split(':', 1)[0] if not re.fullmatch(token_re, field_name): - commentary('invalid warc-fields name', field_name) + commentary.comment('invalid warc-fields name:', field_name) else: lines.append(line) first_line = False + if not lines: + commentary.comment('warc-fields body present but empty') + return + # check known fields def validate_warcinfo(record, commentary, pending): - content_type = record.rec_headers.get_header('Content-Type') + content_type = record.rec_headers.get_header('Content-Type', 'none') if content_type.lower() != 'application/warc-fields': - commentary.recommencation('warcinfo Content-Type of application/warc-fields, saw', content_type) + commentary.recommendation('warcinfo Content-Type of application/warc-fields, saw', content_type) else: # format: warc-fields # allowable fields include but not limited to DMCI plus the following @@ -133,25 +137,27 @@ def validate_warcinfo(record, commentary, pending): def validate_response(record, commentary, pending): - target_uri = record.rec_headers.get_header('WARC-Target-URI').lower() + target_uri = record.rec_headers.get_header('WARC-Target-URI', 'none').lower() # TESTME if target_uri.startswith('http:') or target_uri.startswith('https:'): - content_type = 
record.rec_headers.get_header('Content-Type') + content_type = record.rec_headers.get_header('Content-Type', 'none') if canon_content_type(content_type) not in {'application/http;msgtype=response', 'application/http'}: - commentary.error('responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw ', content_type) + commentary.error('responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw', content_type) if record.rec_headers.get_header('WARC-IP-Address') is None: commentary.error('WARC-IP-Address should be used for http and https responses') # error: http and https schemes should have http response headers + # test by attempting to parse them? + # comment: verify http content-length, if present -- commoncrawl nutch bug def validate_resource(record, commentary, pending): - target_uri = record.rec_headers.get_header('WARC-Target-URI').lower() + target_uri = record.rec_headers.get_header('WARC-Target-URI', '').lower() # TESTME if target_uri.startswith('dns:'): - content_type = record.rec_headers.get_header('Content-Type') + content_type = record.rec_headers.get_header('Content-Type', 'none') if content_type.lower() != 'text/dns': commentary.error('recource records for dns: shall have Content-Type of text/dns, saw', content_type) else: @@ -163,13 +169,13 @@ def validate_resource(record, commentary, pending): def validate_request(record, commentary, pending): - target_uri = record.rec_headers.get_header('WARC-Target-URI').lower() + target_uri = record.rec_headers.get_header('WARC-Target-URI', 'none').lower() # TESTME if target_uri.startswith('http:') or target_uri.startswith('https:'): content_type = record.rec_headers.get_header('Content-Type') if canon_content_type(content_type) not in {'application/http;msgtype=request', 'application/http'}: - commentary.error('requests for http/https should have Content-Type of application/http; msgtype=request or 
application/http, saw ', content_type) + commentary.error('requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw', content_type) if record.rec_headers.get_header('WARC-IP-Address') is None: commentary.error('WARC-IP-Address should be used for http and https requests') @@ -180,7 +186,7 @@ def validate_request(record, commentary, pending): def validate_metadata(record, commentary, pending): - content_type = record.rec_headers.get_header('Content-Type') + content_type = record.rec_headers.get_header('Content-Type', 'none') # TESTME if content_type.lower() == 'application/warc-fields': # dublin core plus via, hopsFromSeed, fetchTimeMs -- w1.1 section 6 # via: uri -- example in Warc 1.1 section 10.5 does not have <> around it @@ -190,7 +196,7 @@ def validate_metadata(record, commentary, pending): def validate_revisit(record, commentary, pending): - warc_profile = record.rec_headers.get_header('WARC-Profile') + warc_profile = record.rec_headers.get_header('WARC-Profile', 'none') # TESTME if warc_profile.endswith('/revisit/identical-payload-digest') or warc_profile.endswith('/revisit/uri-agnostic-identical-payload-digest'): config = { @@ -201,7 +207,7 @@ def validate_revisit(record, commentary, pending): # may have record block; if not, shall have Content-Length: 0, if yes, should be like a response record, truncated FOR LENGTH ONLY if desired # recommended that server response headers be preserved "in this manner" - elif warc_profile.ends_with('/revisit/server-not-modified'): + elif warc_profile.endswith('/revisit/server-not-modified'): config = { 'recommended': ['WARC-Refers-To', 'WARC-Refers-To-Date'], 'prohibited': ['WARC-Payload-Digest'], @@ -216,15 +222,15 @@ def validate_revisit(record, commentary, pending): def validate_conversion(record, commentary, pending): # where practical, have a warc-refers-to field -- not quite a recommendation, perhaps make it a comment? 
# suggests there should be a corresponding metadata record -- which may have a WARC-Refers-To - pass + pass # TESTME def validate_continuation(record, commentary, pending): - commentary.comment('warcio test continuation code has not been tested, expect bugs') + commentary.comment('warcio test continuation code has not been tested, expect bugs') # TESTME - warc_type = record.rec_headers.get_header('WARC-Type') - if warc_type in {'warcinfo', 'request', 'metadata', 'revisit'}: - commentary.recommendation('do not segment warc-type', warc_type) + segment_number = record.rec_headers.get_header('WARC-Segment-Number', 'none') + if segment_number.isdigit() and int(segment_number) < 2: + commentary.error('continuation record must have WARC-Segment-Number > 1, saw', segment_number) # last segment: required WARC-Segment-Total-Length, optional WARC-Truncated @@ -234,7 +240,7 @@ def validate_actual_uri(field, value, record, version, commentary, pending): # should use a registered scheme # %XX encoding, normalize to upper case # schemes are case-insensitive and normalize to lower - if value.startswith('<') or value.endswith('>'): + if value.startswith('<') or value.endswith('>'): # TESTME # wget 1.19 bug caused by WARC 1.0 spec error commentary.error('uri must not be within <>', field, value) if ':' not in value: @@ -250,10 +256,10 @@ def validate_actual_uri(field, value, record, version, commentary, pending): def validate_warc_type(field, value, record, version, commentary, pending): if not value.islower(): # I am unclear if this is allowed? 
standard is silent - commentary.comment('Warc-Type is not lower-case', field, value) + commentary.comment('WARC-Type is not lower-case', field, value) if value.lower() not in record_types: # standard says readers should ignore unknown warc-types - commentary.comment('unknown Warc-Type', field, value) + commentary.comment('unknown WARC-Type', field, value) def validate_uri(field, value, record, version, commentary, pending): @@ -307,8 +313,10 @@ def validate_content_type(field, value, record, version, commentary, pending): subtype = rest if not re.fullmatch(token_re, subtype, re.A): commentary.error('invalid subtype', field, value) + # at this point there can be multiple parameters, # some of which could have quoted string values with ; in them + # TODO: more checking @@ -372,11 +380,17 @@ def validate_profile(field, value, record, version, commentary, pending): def validate_segment_number(field, value, record, version, commentary, pending): if not value.isdigit(): commentary.error('must be an integer', field, value) + return iv = int(value) if iv == 0: commentary.error('must be 1 or greater', field, value) - # TODO: type != continuation must have iv == 1, else iv > 1 - # might make that check in the 'continuation' section? 
+ + rec_type = record.rec_headers.get_header('WARC-Type', 'none') + if rec_type != 'continuation': + if iv != 1: + commentary.error('non-continuation records must always have WARC-Segment-Number = 1', field, value) + elif rec_type in {'warcinfo', 'request', 'metadata', 'revisit'}: + commentary.recommendation('do not segment warc-type', warc_type) def validate_segment_total_length(field, value, record, version, commentary, pending): @@ -507,7 +521,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen 'continuation': { 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'WARC-Segment-Origin-ID', 'WARC-Segment-Number', 'WARC-Target-URI'], - 'optional': [], + 'optional': ['WARC-Segment-Total-Length', 'WARC-Truncated'], 'prohibited': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], 'validate': validate_continuation, }, @@ -522,10 +536,10 @@ def make_header_set(config, kinds): def validate_fields_against_rec_type(rec_type, config, rec_headers, commentary, allow_all=False): - for req in config.get('required', []): + for req in sorted(config.get('required', [])): if not rec_headers.get_header(req): commentary.error('missing required header', req) - for rec in config.get('recommended', []): + for rec in sorted(config.get('recommended', [])): if not rec_headers.get_header(rec): commentary.recommendation('missing recommended header', rec) allowed = make_header_set(config, ('required', 'optional', 'recommended')) @@ -554,9 +568,6 @@ def validate_record(record): record_id = record.rec_headers.get_header('WARC-Record-ID') rec_type = record.rec_headers.get_header('WARC-Type') - if record_id is None: - print('no WARC-Record-ID seen, skipping validation', file=sys.stderr) - return commentary = Commentary(record_id, rec_type) pending = None From 903ed1d9f0da0458dcfa9e2e055de7846b4bf13d Mon 
Sep 17 00:00:00 2001 From: Greg Lindahl Date: Sat, 26 Jan 2019 08:46:10 -0800 Subject: [PATCH 40/68] python 2.7 test fix --- warcio/tester.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/warcio/tester.py b/warcio/tester.py index bdfe38f0..b74a3b03 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -105,7 +105,7 @@ def validate_warc_fields(record, commentary): commentary.comment('Missing field-name : in warc-fields line:', line) else: field_name = line.split(':', 1)[0] - if not re.fullmatch(token_re, field_name): + if not re.search(token_re, field_name): commentary.comment('invalid warc-fields name:', field_name) else: lines.append(line) @@ -248,7 +248,7 @@ def validate_actual_uri(field, value, record, version, commentary, pending): if re.search(r'\s', value, re.A): commentary.error('invalid uri, contains whitespace', field, value) scheme, rest = value.split(':', 1) - if not re.fullmatch(r'[A-Za-z][A-Za-z0-9+\-\.]*', scheme, re.A): + if not re.search(r'\A[A-Za-z][A-Za-z0-9+\-\.]*\Z', scheme, re.A): commentary.error('invalid uri scheme, bad character', field, value) # https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml @@ -283,7 +283,7 @@ def validate_timestamp(field, value, record, version, commentary, pending): commentary.error('WARC 1.0 may not have fractional seconds', field, value) else: start, end = value.split('.', 1) - if not re.fullmatch(r'[0-9]{1,9}Z', end, re.A): + if not re.search(r'\A[0-9]{1,9}Z\Z', end, re.A): commentary.error('fractional seconds must have 1-9 digits', field, value) # XXX the above is pretty incomplete for dash, colon, trailing Z, etc @@ -297,21 +297,21 @@ def validate_content_length(field, value, record, version, commentary, pending): commentary.error('must be an integer', field, value) -token_re = r'[!"#$%&\'()*+\-\.0-9A-Z\^_`a-z|~]+' -digest_re = r'[A-Za-z0-9/+\-_=]+' +token_re = r'\A[!"#$%&\'()*+\-\.0-9A-Z\^_`a-z|~]+\Z' +digest_re = r'\A[A-Za-z0-9/+\-_=]+\Z' def 
validate_content_type(field, value, record, version, commentary, pending): if '/' not in value: commentary.error('must contain a /', field, value) ctype, rest = value.split('/', 1) - if not re.fullmatch(token_re, ctype, re.A): + if not re.search(token_re, ctype, re.A): commentary.error('invalid type', field, value) if ';' in rest: subtype, rest = rest.split(';', 1) else: subtype = rest - if not re.fullmatch(token_re, subtype, re.A): + if not re.search(token_re, subtype, re.A): commentary.error('invalid subtype', field, value) # at this point there can be multiple parameters, @@ -324,13 +324,13 @@ def validate_digest(field, value, record, version, commentary, pending): if ':' not in value: commentary.error('missing algorithm', field, value) algorithm, digest = value.split(':', 1) - if not re.fullmatch(token_re, algorithm, re.A): + if not re.search(token_re, algorithm, re.A): commentary.error('invalid algorithm', field, value) - if not re.fullmatch(token_re, digest, re.A): + if not re.search(token_re, digest, re.A): # https://github.com/iipc/warc-specifications/issues/48 # commentary.comment('spec incorrectly says this is an invalid digest', field, value) pass - if not re.fullmatch(digest_re, digest, re.A): + if not re.search(digest_re, digest, re.A): commentary.comment('Invalid-looking digest value', field, value) From 68938bdce2de2180a204a600511e0a5242c5142a Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Sat, 26 Jan 2019 08:51:08 -0800 Subject: [PATCH 41/68] python 2.7 fixes --- warcio/tester.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/warcio/tester.py b/warcio/tester.py index b74a3b03..c978a404 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -245,10 +245,10 @@ def validate_actual_uri(field, value, record, version, commentary, pending): commentary.error('uri must not be within <>', field, value) if ':' not in value: commentary.error('invalid uri, no scheme', field, value) - if re.search(r'\s', value, re.A): + if 
re.search(r'\s', value): commentary.error('invalid uri, contains whitespace', field, value) scheme, rest = value.split(':', 1) - if not re.search(r'\A[A-Za-z][A-Za-z0-9+\-\.]*\Z', scheme, re.A): + if not re.search(r'\A[A-Za-z][A-Za-z0-9+\-\.]*\Z', scheme): commentary.error('invalid uri scheme, bad character', field, value) # https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml @@ -283,7 +283,7 @@ def validate_timestamp(field, value, record, version, commentary, pending): commentary.error('WARC 1.0 may not have fractional seconds', field, value) else: start, end = value.split('.', 1) - if not re.search(r'\A[0-9]{1,9}Z\Z', end, re.A): + if not re.search(r'\A[0-9]{1,9}Z\Z', end): commentary.error('fractional seconds must have 1-9 digits', field, value) # XXX the above is pretty incomplete for dash, colon, trailing Z, etc @@ -305,13 +305,13 @@ def validate_content_type(field, value, record, version, commentary, pending): if '/' not in value: commentary.error('must contain a /', field, value) ctype, rest = value.split('/', 1) - if not re.search(token_re, ctype, re.A): + if not re.search(token_re, ctype): commentary.error('invalid type', field, value) if ';' in rest: subtype, rest = rest.split(';', 1) else: subtype = rest - if not re.search(token_re, subtype, re.A): + if not re.search(token_re, subtype): commentary.error('invalid subtype', field, value) # at this point there can be multiple parameters, @@ -324,13 +324,13 @@ def validate_digest(field, value, record, version, commentary, pending): if ':' not in value: commentary.error('missing algorithm', field, value) algorithm, digest = value.split(':', 1) - if not re.search(token_re, algorithm, re.A): + if not re.search(token_re, algorithm): commentary.error('invalid algorithm', field, value) - if not re.search(token_re, digest, re.A): + if not re.search(token_re, digest): # https://github.com/iipc/warc-specifications/issues/48 # commentary.comment('spec incorrectly says this is an invalid digest', field, 
value) pass - if not re.search(digest_re, digest, re.A): + if not re.search(digest_re, digest): commentary.comment('Invalid-looking digest value', field, value) From 234468a5f36176b134c71d47437865e7341e49d2 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Sat, 26 Jan 2019 11:08:05 -0800 Subject: [PATCH 42/68] coverage --- .../data/standard-torture-validate-field.warc | 52 ++++++++ .../standard-torture-validate-record.warc | 5 + test/test_tests.py | 123 +++++++++++++++++- warcio/tester.py | 73 +++++++---- 4 files changed, 219 insertions(+), 34 deletions(-) create mode 100644 test/data/standard-torture-validate-field.warc diff --git a/test/data/standard-torture-validate-field.warc b/test/data/standard-torture-validate-field.warc new file mode 100644 index 00000000..2c28d72d --- /dev/null +++ b/test/data/standard-torture-validate-field.warc @@ -0,0 +1,52 @@ +WARC/1.0 +WARC-Target-URI: +WARC-Target-URI: example.com +WARC-Target-URI: ex ample.com +WARC-Target-URI: h<>ttp://example.com/ +WARC-Type: does-not-exist +WARC-Type: CAPITALIZED +WARC-Concurrent-To: http://example.com/ +WARC-Record-ID: +WARC-Date: 2017-03-06T04:03:53Z +WARC-Date: 2017-03-06T04:03:53.Z +Content-Type: asdf +Content-Type: has space/asdf +Content-Type: asdf/has space +Content-Type: asdf/has space;asdf +WARC-Block-Digest: asdf +WARC-Block-Digest: has space:asdf +WARC-Block-Digest: sha1:&$*^&*^#*&^ +WARC-IP-Address: 1.2.3.4.5 +WARC-Truncated: invalid +WARC-Warcinfo-ID: asdf:asdf +WARC-Filename: not-yet-tested +WARC-Profile: asdf +WARC-Profile: http://netpreserve.org/warc/1.0/revisit/identical-payload-digest +WARC-Identified-Payload-Type: asdf +WARC-Segment-Origin-ID: http://example.com +WARC-Segment-Number: not-an-integer +WARC-Segment-Number: 0 +WARC-Segment-Number: 1 +WARC-Segment-Number: 2 +WARC-Segment-Total-Length: 0 +WARC-Segment-Total-Length: not-an-integer +WARC-Refers-To-Target-URI: http://example.com +WARC-Refers-To-Date: not-a-date +WARC-Unknown-Field: asdf +Content-Length: 0 + + +WARC/1.1 
+WARC-Date: 2017-03-06T04:03:53Z +WARC-Date: 2017-03-06T04:03:53.Z +WARC-Type: invalid +Content-Length: 0 + + +WARC/1.1 +WARC-Type: request +WARC-Segment-Number: 1 +Content-Length: 0 + + +WARC/invalid diff --git a/test/data/standard-torture-validate-record.warc b/test/data/standard-torture-validate-record.warc index 5181ea38..d212f370 100644 --- a/test/data/standard-torture-validate-record.warc +++ b/test/data/standard-torture-validate-record.warc @@ -71,6 +71,11 @@ WARC-Profile: http://netpreserve.org/warc/1.0/revisit/server-not-modified Content-Length: 0 +WARC/1.0 +WARC-Type: conversion +Content-Length: 0 + + WARC/1.0 WARC-Type: continuation WARC-Segment-Number: 1 diff --git a/test/test_tests.py b/test/test_tests.py index 239d2461..19b7e377 100644 --- a/test/test_tests.py +++ b/test/test_tests.py @@ -2,6 +2,7 @@ from . import get_test_file from .test_cli import patch_stdout +from warcio.utils import to_native_str def helper(args, expected_exit_value): @@ -14,14 +15,14 @@ def helper(args, expected_exit_value): finally: assert exit_value == expected_exit_value - return buff.getvalue() + return to_native_str(buff.getvalue()) def remove_before_test_data(s): - ret = b'' + ret = '' for line in s.splitlines(True): - if b'/test/data/' in line: - line = b'test/data/' + line.split(b'/test/data/', 1)[1] + if '/test/data/' in line: + line = 'test/data/' + line.split('/test/data/', 1)[1] ret += line return ret @@ -33,7 +34,7 @@ def test_torture_missing(): args = ['test'] args.extend(files) - expected = b"""\ + expected = """\ test/data/standard-torture-missing.warc WARC-Record-ID None WARC-Type warcinfo @@ -55,7 +56,7 @@ def test_torture_validate_record(): args = ['test'] args.extend(files) - expected = b"""\ + expected = """\ test/data/standard-torture-validate-record.warc WARC-Record-ID None WARC-Type warcinfo @@ -85,6 +86,7 @@ def test_torture_validate_record(): digest not present error: missing required header WARC-Date error: missing required header WARC-Record-ID + 
error: recource records for dns: shall have Content-Type of text/dns, saw text/plain WARC-Record-ID None WARC-Type resource digest not present @@ -133,6 +135,12 @@ def test_torture_validate_record(): recommendation: missing recommended header WARC-Refers-To recommendation: missing recommended header WARC-Refers-To-Date comment: extension seen warc-profile http://netpreserve.org/warc/1.0/revisit/server-not-modified + WARC-Record-ID None + WARC-Type conversion + digest not present + error: missing required header WARC-Date + error: missing required header WARC-Record-ID + error: missing required header WARC-Target-URI WARC-Record-ID None WARC-Type continuation digest not present @@ -145,5 +153,106 @@ def test_torture_validate_record(): """ value = helper(args, 0) - print(remove_before_test_data(value).decode()) + print(remove_before_test_data(value)) + assert remove_before_test_data(value) == expected + + +def test_torture_validate_field(): + files = ['standard-torture-validate-field.warc'] + files = [get_test_file(filename) for filename in files] + + args = ['test'] + args.extend(files) + + expected = """\ +test/data/standard-torture-validate-field.warc + WARC-Record-ID + WARC-Type does-not-exist + unknown hash algorithm name in block digest + error: uri must not be within <> warc-target-uri + error: invalid uri scheme, bad character warc-target-uri + error: duplicate field seen warc-target-uri example.com + error: invalid uri, no scheme warc-target-uri example.com + error: duplicate field seen warc-target-uri ex ample.com + error: invalid uri, no scheme warc-target-uri ex ample.com + error: invalid uri, contains whitespace warc-target-uri ex ample.com + error: invalid uri scheme, bad character warc-target-uri ex ample.com + error: duplicate field seen warc-target-uri h<>ttp://example.com/ + error: invalid uri scheme, bad character warc-target-uri h<>ttp://example.com/ + error: duplicate field seen warc-type CAPITALIZED + error: uri must be within <> 
warc-concurrent-to http://example.com/ + error: duplicate field seen warc-date 2017-03-06T04:03:53.Z + error: WARC 1.0 may not have fractional seconds warc-date 2017-03-06T04:03:53.Z + error: must contain a / content-type asdf + error: invalid subtype content-type asdf + error: duplicate field seen content-type has space/asdf + error: invalid type content-type has space/asdf + error: duplicate field seen content-type asdf/has space + error: invalid subtype content-type asdf/has space + error: duplicate field seen content-type asdf/has space;asdf + error: invalid subtype content-type asdf/has space;asdf + error: missing algorithm warc-block-digest asdf + error: duplicate field seen warc-block-digest has space:asdf + error: invalid algorithm warc-block-digest has space:asdf + error: duplicate field seen warc-block-digest sha1:&$*^&*^#*&^ + error: uri must be within <> warc-warcinfo-id asdf:asdf + error: duplicate field seen warc-profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest + error: must contain a / warc-identified-payload-type asdf + error: invalid subtype warc-identified-payload-type asdf + error: uri must be within <> warc-segment-origin-id http://example.com + error: must be an integer warc-segment-number not-an-integer + error: duplicate field seen warc-segment-number 0 + error: must be 1 or greater warc-segment-number 0 + error: non-continuation records must always have WARC-Segment-Number = 1 warc-segment-number 0 + error: duplicate field seen warc-segment-number 1 + error: duplicate field seen warc-segment-number 2 + error: non-continuation records must always have WARC-Segment-Number = 1 warc-segment-number 2 + error: duplicate field seen warc-segment-total-length not-an-integer + error: must be an integer warc-segment-total-length not-an-integer + comment: unknown WARC-Type warc-type does-not-exist + comment: WARC-Type is not lower-case warc-type CAPITALIZED + comment: unknown WARC-Type warc-type CAPITALIZED + comment: unknown 
digest algorithm warc-block-digest asdf + comment: Invalid-looking digest value warc-block-digest sha1:&$*^&*^#*&^ + comment: did not check ip address format, install ipaddress module from pypi if you care + comment: extension seen warc-truncated invalid + comment: extension seen warc-profile asdf + comment: extension seen warc-profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest + comment: field was introduced after this warc version WARC-Refers-To-Target-URI http://example.com 1.0 + comment: field was introduced after this warc version WARC-Refers-To-Date not-a-date 1.0 + comment: unknown field, no validation performed WARC-Unknown-Field asdf + WARC-Record-ID None + WARC-Type invalid + digest not present + error: duplicate field seen warc-date 2017-03-06T04:03:53.Z + error: fractional seconds must have 1-9 digits warc-date 2017-03-06T04:03:53.Z + comment: unknown WARC-Type warc-type invalid + WARC-Record-ID None + WARC-Type request + digest not present + error: missing required header Content-Type + error: missing required header WARC-Date + error: missing required header WARC-Record-ID + error: missing required header WARC-Target-URI + recommendation: do not segment WARC-Type request + comment: no configuration seen for WARC-Segment-Number request +""" + + value = helper(args, 0) + print(remove_before_test_data(value)) + assert remove_before_test_data(value) == expected + + +def test_arc(): + files = ['does-not-exist.arc'] + files = [get_test_file(filename) for filename in files] + + args = ['test'] + args.extend(files) + + expected = """\ +test/data/does-not-exist.arc +""" + + value = helper(args, 0) assert remove_before_test_data(value) == expected diff --git a/warcio/tester.py b/warcio/tester.py index c978a404..4c2f8299 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -5,15 +5,15 @@ import traceback from warcio.archiveiterator import WARCIterator -from warcio.utils import to_native_str +from warcio.utils import to_native_str, 
Digester -def try_ipaddress_init(): +def try_ipaddress_import(): # ipaddress is in 3.3+ but not 2.7. It is in pypi but we wish to limit dependencies. try: import ipaddress except ImportError: # pragma: no cover - pass + print('ipaddress module not imported') class Commentary: @@ -75,10 +75,10 @@ def validate_warc_fields(record, commentary): # field-value = *( field-content | LWS ) # LWS signals continuations # field-name = token # token_re - content = record.content # TESTME + content = record.content try: text = to_native_str(content, 'utf-8', errors='strict') - except UnicodeDecodeError as e: # TESTME + except UnicodeDecodeError as e: commentary.error('warc-fields contains invalid utf-8: '+str(e)) text = to_native_str(content, 'utf-8', errors='replace') @@ -137,7 +137,7 @@ def validate_warcinfo(record, commentary, pending): def validate_response(record, commentary, pending): - target_uri = record.rec_headers.get_header('WARC-Target-URI', 'none').lower() # TESTME + target_uri = record.rec_headers.get_header('WARC-Target-URI', 'none').lower() if target_uri.startswith('http:') or target_uri.startswith('https:'): content_type = record.rec_headers.get_header('Content-Type', 'none') @@ -154,7 +154,7 @@ def validate_response(record, commentary, pending): def validate_resource(record, commentary, pending): - target_uri = record.rec_headers.get_header('WARC-Target-URI', '').lower() # TESTME + target_uri = record.rec_headers.get_header('WARC-Target-URI', '').lower() if target_uri.startswith('dns:'): content_type = record.rec_headers.get_header('Content-Type', 'none') @@ -169,7 +169,7 @@ def validate_resource(record, commentary, pending): def validate_request(record, commentary, pending): - target_uri = record.rec_headers.get_header('WARC-Target-URI', 'none').lower() # TESTME + target_uri = record.rec_headers.get_header('WARC-Target-URI', 'none').lower() if target_uri.startswith('http:') or target_uri.startswith('https:'): content_type = 
record.rec_headers.get_header('Content-Type') @@ -186,7 +186,7 @@ def validate_request(record, commentary, pending): def validate_metadata(record, commentary, pending): - content_type = record.rec_headers.get_header('Content-Type', 'none') # TESTME + content_type = record.rec_headers.get_header('Content-Type', 'none') if content_type.lower() == 'application/warc-fields': # dublin core plus via, hopsFromSeed, fetchTimeMs -- w1.1 section 6 # via: uri -- example in Warc 1.1 section 10.5 does not have <> around it @@ -196,7 +196,7 @@ def validate_metadata(record, commentary, pending): def validate_revisit(record, commentary, pending): - warc_profile = record.rec_headers.get_header('WARC-Profile', 'none') # TESTME + warc_profile = record.rec_headers.get_header('WARC-Profile', 'none') if warc_profile.endswith('/revisit/identical-payload-digest') or warc_profile.endswith('/revisit/uri-agnostic-identical-payload-digest'): config = { @@ -222,11 +222,11 @@ def validate_revisit(record, commentary, pending): def validate_conversion(record, commentary, pending): # where practical, have a warc-refers-to field -- not quite a recommendation, perhaps make it a comment? 
# suggests there should be a corresponding metadata record -- which may have a WARC-Refers-To - pass # TESTME + pass def validate_continuation(record, commentary, pending): - commentary.comment('warcio test continuation code has not been tested, expect bugs') # TESTME + commentary.comment('warcio test continuation code has not been tested, expect bugs') segment_number = record.rec_headers.get_header('WARC-Segment-Number', 'none') if segment_number.isdigit() and int(segment_number) < 2: @@ -240,14 +240,14 @@ def validate_actual_uri(field, value, record, version, commentary, pending): # should use a registered scheme # %XX encoding, normalize to upper case # schemes are case-insensitive and normalize to lower - if value.startswith('<') or value.endswith('>'): # TESTME + if value.startswith('<') or value.endswith('>'): # wget 1.19 bug caused by WARC 1.0 spec error commentary.error('uri must not be within <>', field, value) if ':' not in value: commentary.error('invalid uri, no scheme', field, value) if re.search(r'\s', value): commentary.error('invalid uri, contains whitespace', field, value) - scheme, rest = value.split(':', 1) + scheme = value.split(':', 1)[0] if not re.search(r'\A[A-Za-z][A-Za-z0-9+\-\.]*\Z', scheme): commentary.error('invalid uri scheme, bad character', field, value) # https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml @@ -282,9 +282,10 @@ def validate_timestamp(field, value, record, version, commentary, pending): # XXX specification infelicity: would be nice to have 'advice to implementers' here commentary.error('WARC 1.0 may not have fractional seconds', field, value) else: - start, end = value.split('.', 1) - if not re.search(r'\A[0-9]{1,9}Z\Z', end): - commentary.error('fractional seconds must have 1-9 digits', field, value) + if '.' 
in value: + start, end = value.split('.', 1) + if not re.search(r'\A[0-9]{1,9}Z\Z', end): + commentary.error('fractional seconds must have 1-9 digits', field, value) # XXX the above is pretty incomplete for dash, colon, trailing Z, etc @@ -304,7 +305,12 @@ def validate_content_length(field, value, record, version, commentary, pending): def validate_content_type(field, value, record, version, commentary, pending): if '/' not in value: commentary.error('must contain a /', field, value) - ctype, rest = value.split('/', 1) + splits = value.split('/', 1) + ctype = splits[0] + if len(splits) > 1: + rest = splits[1] + else: + rest = '' if not re.search(token_re, ctype): commentary.error('invalid type', field, value) if ';' in rest: @@ -323,9 +329,19 @@ def validate_content_type(field, value, record, version, commentary, pending): def validate_digest(field, value, record, version, commentary, pending): if ':' not in value: commentary.error('missing algorithm', field, value) - algorithm, digest = value.split(':', 1) + splits = value.split(':', 1) + algorithm = splits[0] + if len(splits) > 1: + digest = splits[1] + else: + digest = 'none' if not re.search(token_re, algorithm): commentary.error('invalid algorithm', field, value) + else: + try: + Digester(algorithm) + except ValueError: + commentary.comment('unknown digest algorithm', field, value) if not re.search(token_re, digest): # https://github.com/iipc/warc-specifications/issues/48 # commentary.comment('spec incorrectly says this is an invalid digest', field, value) @@ -389,8 +405,8 @@ def validate_segment_number(field, value, record, version, commentary, pending): if rec_type != 'continuation': if iv != 1: commentary.error('non-continuation records must always have WARC-Segment-Number = 1', field, value) - elif rec_type in {'warcinfo', 'request', 'metadata', 'revisit'}: - commentary.recommendation('do not segment warc-type', warc_type) + if rec_type in {'warcinfo', 'request', 'metadata', 'revisit'}: + 
commentary.recommendation('do not segment WARC-Type', rec_type) def validate_segment_total_length(field, value, record, version, commentary, pending): @@ -418,7 +434,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen 'validate': validate_uri, }, 'WARC-Block-Digest': { - 'validate': validate_digest, # openssl check? or just let check_digest get it? + 'validate': validate_digest, }, 'WARC-Payload-Digest': { 'validate': validate_digest, @@ -487,6 +503,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated', 'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-Identified-Payload-Type'], 'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], + 'validate': validate_resource, }, 'request': { 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', @@ -577,6 +594,7 @@ def validate_record(record): field = field.lower() if field != 'warc-concurrent-to' and field in seen_fields: commentary.error('duplicate field seen', field, value) + seen_fields.add(field) if field not in warc_fields: commentary.comment('unknown field, no validation performed', field_case, value) continue @@ -588,9 +606,8 @@ def validate_record(record): if 'validate' in config: config['validate'](field, value, record, version, commentary, pending) - # TODO: validate warc types: unknown should get a comment if rec_type not in record_types: - commentary.comment('unknown record type, no validation performed', rec_type) + pass # we print a comment for this elsewhere else: validate_fields_against_rec_type(rec_type, record_types[rec_type], record.rec_headers, commentary) validate_record_against_rec_type(record_types[rec_type], record, commentary, pending) @@ -614,7 +631,7 @@ def _process_one(warc): record.content # make sure digests are checked # XXX might need to read and digest the 
raw stream to check digests for chunked encoding? # XXX chunked lacks Content-Length and presumably the digest needs to be computed on the non-chunked bytes - except Exception: + except Exception: # pragma: no cover # because of the top-level try: to catch exceptions in WARCIterator, this is needed to debug our code print('Caught exception in warcio test analysis code') traceback.print_exc() @@ -643,7 +660,6 @@ class Tester(object): def __init__(self, cmd): self.inputs = cmd.inputs self.exit_value = 0 - try_ipaddress_init() def process_all(self): for warc in self.inputs: @@ -651,9 +667,12 @@ def process_all(self): try: self.process_one(warc) except Exception as e: - print(' saw exception '+str(e).rstrip(), file=sys.stderr) + print(' saw exception '+repr(e).rstrip(), file=sys.stderr) print(' skipping rest of file', file=sys.stderr) return self.exit_value def process_one(self, filename): _process_one(filename) + + +try_ipaddress_import() From e7f88e7183fac1dda31d8ad783d69f82da046e9c Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Sun, 27 Jan 2019 15:25:36 -0800 Subject: [PATCH 43/68] py2 testing --- test/test_tests.py | 23 ++++++++++++++++++++--- warcio/tester.py | 14 ++------------ 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/test/test_tests.py b/test/test_tests.py index 19b7e377..a197c3ba 100644 --- a/test/test_tests.py +++ b/test/test_tests.py @@ -1,8 +1,10 @@ +import six + from warcio.cli import main +from warcio.utils import to_native_str from . 
import get_test_file from .test_cli import patch_stdout -from warcio.utils import to_native_str def helper(args, expected_exit_value): @@ -154,7 +156,13 @@ def test_torture_validate_record(): value = helper(args, 0) print(remove_before_test_data(value)) - assert remove_before_test_data(value) == expected + + ret = remove_before_test_data(value) + + if six.PY2: + expected = expected.replace('\n error: warc-fields contains invalid utf-8: \'utf-8\' codec can\'t decode byte 0xc3 in position 57: invalid continuation byte\n', '\n') + + assert ret == expected def test_torture_validate_field(): @@ -195,6 +203,7 @@ def test_torture_validate_field(): error: duplicate field seen warc-block-digest has space:asdf error: invalid algorithm warc-block-digest has space:asdf error: duplicate field seen warc-block-digest sha1:&$*^&*^#*&^ + error: invalid ip warc-ip-address 1.2.3.4.5 error: uri must be within <> warc-warcinfo-id asdf:asdf error: duplicate field seen warc-profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest error: must contain a / warc-identified-payload-type asdf @@ -214,7 +223,6 @@ def test_torture_validate_field(): comment: unknown WARC-Type warc-type CAPITALIZED comment: unknown digest algorithm warc-block-digest asdf comment: Invalid-looking digest value warc-block-digest sha1:&$*^&*^#*&^ - comment: did not check ip address format, install ipaddress module from pypi if you care comment: extension seen warc-truncated invalid comment: extension seen warc-profile asdf comment: extension seen warc-profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest @@ -240,6 +248,15 @@ def test_torture_validate_field(): value = helper(args, 0) print(remove_before_test_data(value)) + + ret = remove_before_test_data(value) + if six.PY2: + if 'error: invalid ip warc-ip-address 1.2.3.4.5' not in ret: + # user did not install ipaddress module + expected = expected.replace('\n error: invalid ip warc-ip-address 1.2.3.4.5\n', '\n') + ret = 
ret.replace('\n comment: did not check ip address format, install ipaddress module from pypi if you care\n', '\n') + + assert remove_before_test_data(value) == expected diff --git a/warcio/tester.py b/warcio/tester.py index 4c2f8299..308f35fd 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -8,14 +8,6 @@ from warcio.utils import to_native_str, Digester -def try_ipaddress_import(): - # ipaddress is in 3.3+ but not 2.7. It is in pypi but we wish to limit dependencies. - try: - import ipaddress - except ImportError: # pragma: no cover - print('ipaddress module not imported') - - class Commentary: def __init__(self, record_id, rec_type): self._record_id = record_id @@ -353,10 +345,11 @@ def validate_digest(field, value, record, version, commentary, pending): def validate_ip(field, value, record, version, commentary, pending): # ipv4 as dotted quad, or ipv6 per section 2.2 of rfc 4291 try: + import ipaddress ipaddress.ip_address(value) except ValueError: commentary.error('invalid ip', field, value) - except NameError: + except (ImportError, NameError): commentary.comment('did not check ip address format, install ipaddress module from pypi if you care') @@ -673,6 +666,3 @@ def process_all(self): def process_one(self, filename): _process_one(filename) - - -try_ipaddress_import() From 86620731b78187270b8d77afb378a932ed6d3843 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Sun, 27 Jan 2019 15:35:30 -0800 Subject: [PATCH 44/68] py2 windows testing --- test/test_tests.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/test/test_tests.py b/test/test_tests.py index a197c3ba..01e72ef4 100644 --- a/test/test_tests.py +++ b/test/test_tests.py @@ -25,6 +25,8 @@ def remove_before_test_data(s): for line in s.splitlines(True): if '/test/data/' in line: line = 'test/data/' + line.split('/test/data/', 1)[1] + if '\\test\\data\\' in line: + line = 'test/data/' + line.split('\\test\\data\\', 1)[1] ret += line return ret @@ -247,17 +249,16 @@ def 
test_torture_validate_field(): """ value = helper(args, 0) - print(remove_before_test_data(value)) - ret = remove_before_test_data(value) + if six.PY2: if 'error: invalid ip warc-ip-address 1.2.3.4.5' not in ret: # user did not install ipaddress module expected = expected.replace('\n error: invalid ip warc-ip-address 1.2.3.4.5\n', '\n') ret = ret.replace('\n comment: did not check ip address format, install ipaddress module from pypi if you care\n', '\n') - - assert remove_before_test_data(value) == expected + print(ret) + assert ret == expected def test_arc(): From 291460e1970f61173109567ff00b1fe8d3452081 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Sun, 27 Jan 2019 23:11:46 -0800 Subject: [PATCH 45/68] coverage --- .../standard-torture-validate-record.warc | 1 + test/test_tests.py | 55 ++++++++++++++++++- warcio/tester.py | 52 +++++++++--------- 3 files changed, 80 insertions(+), 28 deletions(-) diff --git a/test/data/standard-torture-validate-record.warc b/test/data/standard-torture-validate-record.warc index d212f370..08a39e50 100644 --- a/test/data/standard-torture-validate-record.warc +++ b/test/data/standard-torture-validate-record.warc @@ -1,6 +1,7 @@ WARC/1.0 WARC-Type: warcinfo Content-Type: application/warc-fields +WARC-Refers-To: probhibited Content-Length: 146 first line can't start with a space diff --git a/test/test_tests.py b/test/test_tests.py index 01e72ef4..0fdecc74 100644 --- a/test/test_tests.py +++ b/test/test_tests.py @@ -2,6 +2,7 @@ from warcio.cli import main from warcio.utils import to_native_str +import warcio.tester from . 
import get_test_file from .test_cli import patch_stdout @@ -65,8 +66,10 @@ def test_torture_validate_record(): WARC-Record-ID None WARC-Type warcinfo digest not present + error: uri must be within <> warc-refers-to probhibited error: missing required header WARC-Date error: missing required header WARC-Record-ID + error: field not allowed in record_type WARC-Refers-To warcinfo error: warc-fields contains invalid utf-8: 'utf-8' codec can't decode byte 0xc3 in position 57: invalid continuation byte comment: The first line of warc-fields cannot start with whitespace comment: warc-fields lines must end with \\r\\n: test: lines should end with \\r\\n @@ -129,6 +132,7 @@ def test_torture_validate_record(): recommendation: missing recommended header WARC-Refers-To recommendation: missing recommended header WARC-Refers-To-Date recommendation: missing recommended header WARC-Refers-To-Target-URI + comment: extension seen warc-profile http://netpreserve.org/warc/1.1/revisit/identical-payload-digest WARC-Record-ID None WARC-Type revisit digest not present @@ -138,7 +142,6 @@ def test_torture_validate_record(): error: missing required header WARC-Target-URI recommendation: missing recommended header WARC-Refers-To recommendation: missing recommended header WARC-Refers-To-Date - comment: extension seen warc-profile http://netpreserve.org/warc/1.0/revisit/server-not-modified WARC-Record-ID None WARC-Type conversion digest not present @@ -227,7 +230,6 @@ def test_torture_validate_field(): comment: Invalid-looking digest value warc-block-digest sha1:&$*^&*^#*&^ comment: extension seen warc-truncated invalid comment: extension seen warc-profile asdf - comment: extension seen warc-profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest comment: field was introduced after this warc version WARC-Refers-To-Target-URI http://example.com 1.0 comment: field was introduced after this warc version WARC-Refers-To-Date not-a-date 1.0 comment: unknown field, no validation 
performed WARC-Unknown-Field asdf @@ -274,3 +276,52 @@ def test_arc(): value = helper(args, 0) assert remove_before_test_data(value) == expected + + +def test_digests(): + # needed for test coverage + files = ['example-digest-bad.warc'] + files = [get_test_file(filename) for filename in files] + + args = ['test'] + args.extend(files) + + expected = """\ +test/data/example-digest-bad.warc + WARC-Record-ID + WARC-Type request + payload digest failed: sha1:1112H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ + error: WARC-IP-Address should be used for http and https requests + WARC-Record-ID + WARC-Type request + digest pass + error: WARC-IP-Address should be used for http and https requests + WARC-Record-ID + WARC-Type request + digest pass + error: WARC-IP-Address should be used for http and https requests + WARC-Record-ID + WARC-Type request + digest pass + error: WARC-IP-Address should be used for http and https requests +""" + + value = helper(args, 0) + assert remove_before_test_data(value) == expected + + +def test_leftovers(): + commentary = warcio.tester.Commentary('id', 'type') + + # hard to test because invalid WARC Content-Length raises in archiveiterator + warcio.tester.validate_content_length('content-length', 'not-an-integer', None, '1.0', commentary, None) + + # hard to test because warcio checks the WARC version + warcio.tester.validate_profile('blah', 'blah', None, '999', commentary, None) + + expected = '''\ +error: must be an integer content-length not-an-integer +comment: no profile check because unknown warc version blah blah +''' + + assert '\n'.join(commentary.comments())+'\n' == expected diff --git a/warcio/tester.py b/warcio/tester.py index 308f35fd..de9f3ca1 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -2,10 +2,10 @@ import re import sys -import traceback from warcio.archiveiterator import WARCIterator from warcio.utils import to_native_str, Digester +from warcio.exceptions import ArchiveLoadFailed class Commentary: @@ -196,8 +196,11 @@ def 
validate_revisit(record, commentary, pending): 'recommended': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date'], } validate_fields_against_rec_type('revisit', config, record.rec_headers, commentary, allow_all=True) - # may have record block; if not, shall have Content-Length: 0, if yes, should be like a response record, truncated FOR LENGTH ONLY if desired - # recommended that server response headers be preserved "in this manner" + # may have record block; + # if not, shall have Content-Length: 0, + # if yes, should be like a response record, truncated FOR LENGTH ONLY if desired + # recommended that server response headers be preserved "in this manner" + # I suppose that means headers are required if there is any content?! elif warc_profile.endswith('/revisit/server-not-modified'): config = { @@ -205,7 +208,9 @@ def validate_revisit(record, commentary, pending): 'prohibited': ['WARC-Payload-Digest'], } validate_fields_against_rec_type('revisit', config, record.rec_headers, commentary, allow_all=True) - # may have content body; if not, shall have Content-Length: 0, if yes, should be like a response record, truncated if desired + # may have content body; + # if not, shall have Content-Length: 0, + # if yes, should be like a response record, truncated if desired # WARC-Refers-To-Date should be the same as WARC-Date in the original record if present else: commentary.comment('no revisit details validation done due to unknown profile') @@ -343,13 +348,12 @@ def validate_digest(field, value, record, version, commentary, pending): def validate_ip(field, value, record, version, commentary, pending): - # ipv4 as dotted quad, or ipv6 per section 2.2 of rfc 4291 try: import ipaddress ipaddress.ip_address(value) except ValueError: commentary.error('invalid ip', field, value) - except (ImportError, NameError): + except (ImportError, NameError): # pragma: no cover (for python 2.7) commentary.comment('did not check ip address format, install ipaddress module 
from pypi if you care') @@ -369,12 +373,14 @@ def validate_filename(field, value, record, version, commentary, pending): profiles = { - '1.0': ['http://netpreserve.org/warc/1.1/revisit/identical-payload-digest', - 'http://netpreserve.org/warc/1.1/revisit/server-not-modified', + # XXX WARC/0.17 and WARC/0.18 + '1.0': ['http://netpreserve.org/warc/1.0/revisit/identical-payload-digest', + 'http://netpreserve.org/warc/1.0/revisit/server-not-modified', # the following removed from iipc/webarchive-commons in may 2017; common in the wild TODO comment or not? + # https://github.com/iipc/webarchive-commons/commits/988bec707c27a01333becfc3bd502af4441ea1e1/src/main/java/org/archive/format/warc/WARCConstants.java 'http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest'], - '1.1': ['http://netpreserve.org/warc/1.0/revisit/identical-payload-digest', - 'http://netpreserve.org/warc/1.0/revisit/server-not-modified'], + '1.1': ['http://netpreserve.org/warc/1.1/revisit/identical-payload-digest', + 'http://netpreserve.org/warc/1.1/revisit/server-not-modified'], } @@ -614,21 +620,15 @@ def _process_one(warc): with open(warc, 'rb') as stream: for record in WARCIterator(stream, check_digests=True, fixup_bugs=False): - try: - record = WrapRecord(record) - digest_present = (record.rec_headers.get_header('WARC-Payload-Digest') or - record.rec_headers.get_header('WARC-Block-Digest')) + record = WrapRecord(record) + digest_present = (record.rec_headers.get_header('WARC-Payload-Digest') or + record.rec_headers.get_header('WARC-Block-Digest')) - commentary = validate_record(record) + commentary = validate_record(record) - record.content # make sure digests are checked - # XXX might need to read and digest the raw stream to check digests for chunked encoding? 
- # XXX chunked lacks Content-Length and presumably the digest needs to be computed on the non-chunked bytes - except Exception: # pragma: no cover - # because of the top-level try: to catch exceptions in WARCIterator, this is needed to debug our code - print('Caught exception in warcio test analysis code') - traceback.print_exc() - exit(1) + record.content # make sure digests are checked + # XXX might need to read and digest the raw stream to check digests for chunked encoding? + # XXX chunked lacks Content-Length and presumably the digest needs to be computed on the non-chunked bytes if commentary.has_comments() or record.digest_checker.passed is False: print(' ', 'WARC-Record-ID', commentary.record_id()) @@ -637,7 +637,7 @@ def _process_one(warc): if record.digest_checker.passed is True: print(' digest pass') elif record.digest_checker.passed is None: - if digest_present: + if digest_present: # pragma: no cover print(' digest present but not checked') else: print(' digest not present') @@ -659,8 +659,8 @@ def process_all(self): print(warc) try: self.process_one(warc) - except Exception as e: - print(' saw exception '+repr(e).rstrip(), file=sys.stderr) + except ArchiveLoadFailed as e: + print(' saw exception ArchiveLoadFailed: '+str(e).rstrip(), file=sys.stderr) print(' skipping rest of file', file=sys.stderr) return self.exit_value From 69080d51a10aa6ad2f592447955cd858b0f9fe10 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Sun, 27 Jan 2019 23:57:53 -0800 Subject: [PATCH 46/68] branch coverage --- .../data/standard-torture-validate-field.warc | 1 + .../standard-torture-validate-record.warc | 26 +++++++++++ test/test_tests.py | 44 ++++++++++++++++++- warcio/tester.py | 1 + 4 files changed, 71 insertions(+), 1 deletion(-) diff --git a/test/data/standard-torture-validate-field.warc b/test/data/standard-torture-validate-field.warc index 2c28d72d..c88d3ee6 100644 --- a/test/data/standard-torture-validate-field.warc +++ 
b/test/data/standard-torture-validate-field.warc @@ -39,6 +39,7 @@ Content-Length: 0 WARC/1.1 WARC-Date: 2017-03-06T04:03:53Z WARC-Date: 2017-03-06T04:03:53.Z +WARC-Date: 2017-03-06T04:03:53.0Z WARC-Type: invalid Content-Length: 0 diff --git a/test/data/standard-torture-validate-record.warc b/test/data/standard-torture-validate-record.warc index 08a39e50..6f06205e 100644 --- a/test/data/standard-torture-validate-record.warc +++ b/test/data/standard-torture-validate-record.warc @@ -41,9 +41,23 @@ Content-Type: text/dns Content-Length: 0 +WARC/1.0 +WARC-Type: resource +WARC-Target-URI: foo:bar +Content-Length: 0 + + +WARC/1.0 +WARC-Type: request +WARC-Target-URI: hTtP://example.com/ +Content-Type: text/plain +Content-Length: 0 + + WARC/1.0 WARC-Type: request WARC-Target-URI: hTtP://example.com/ +WARC-IP-Address: 1.2.3.4 Content-Type: text/plain Content-Length: 0 @@ -54,6 +68,12 @@ Content-Type: application/warc-fields Content-Length: 0 +WARC/1.0 +WARC-Type: metadata +Content-Type: not-application/warc-fields +Content-Length: 0 + + WARC/1.0 WARC-Type: revisit WARC-Profile: none @@ -83,3 +103,9 @@ WARC-Segment-Number: 1 Content-Length: 0 +WARC/1.0 +WARC-Type: continuation +WARC-Segment-Number: 2 +Content-Length: 0 + + diff --git a/test/test_tests.py b/test/test_tests.py index 0fdecc74..174466c8 100644 --- a/test/test_tests.py +++ b/test/test_tests.py @@ -99,6 +99,12 @@ def test_torture_validate_record(): digest not present error: missing required header WARC-Date error: missing required header WARC-Record-ID + WARC-Record-ID None + WARC-Type resource + digest not present + error: missing required header Content-Type + error: missing required header WARC-Date + error: missing required header WARC-Record-ID WARC-Record-ID None WARC-Type request digest not present @@ -106,12 +112,23 @@ def test_torture_validate_record(): error: missing required header WARC-Record-ID error: requests for http/https should have Content-Type of application/http; msgtype=request or 
application/http, saw text/plain error: WARC-IP-Address should be used for http and https requests + WARC-Record-ID None + WARC-Type request + digest not present + error: missing required header WARC-Date + error: missing required header WARC-Record-ID + error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw text/plain WARC-Record-ID None WARC-Type metadata digest not present error: missing required header WARC-Date error: missing required header WARC-Record-ID comment: warc-fields body present but empty + WARC-Record-ID None + WARC-Type metadata + digest not present + error: missing required header WARC-Date + error: missing required header WARC-Record-ID WARC-Record-ID None WARC-Type revisit digest not present @@ -157,6 +174,14 @@ def test_torture_validate_record(): error: missing required header WARC-Target-URI error: continuation record must have WARC-Segment-Number > 1, saw 1 comment: warcio test continuation code has not been tested, expect bugs + WARC-Record-ID None + WARC-Type continuation + digest not present + error: missing required header WARC-Date + error: missing required header WARC-Record-ID + error: missing required header WARC-Segment-Origin-ID + error: missing required header WARC-Target-URI + comment: warcio test continuation code has not been tested, expect bugs """ value = helper(args, 0) @@ -238,6 +263,7 @@ def test_torture_validate_field(): digest not present error: duplicate field seen warc-date 2017-03-06T04:03:53.Z error: fractional seconds must have 1-9 digits warc-date 2017-03-06T04:03:53.Z + error: duplicate field seen warc-date 2017-03-06T04:03:53.0Z comment: unknown WARC-Type warc-type invalid WARC-Record-ID None WARC-Type request @@ -280,7 +306,7 @@ def test_arc(): def test_digests(): # needed for test coverage - files = ['example-digest-bad.warc'] + files = ['example-digest-bad.warc', 'example.warc'] files = [get_test_file(filename) for filename in files] args = 
['test'] @@ -304,6 +330,21 @@ def test_digests(): WARC-Type request digest pass error: WARC-IP-Address should be used for http and https requests +test/data/example.warc + WARC-Record-ID + WARC-Type request + digest not present + error: WARC-IP-Address should be used for http and https requests + WARC-Record-ID + WARC-Type revisit + digest present but not checked + recommendation: missing recommended header WARC-Refers-To + comment: field was introduced after this warc version WARC-Refers-To-Target-URI http://example.com/ 1.0 + comment: field was introduced after this warc version WARC-Refers-To-Date 2017-03-06T04:02:06Z 1.0 + WARC-Record-ID + WARC-Type request + digest not present + error: WARC-IP-Address should be used for http and https requests """ value = helper(args, 0) @@ -312,6 +353,7 @@ def test_digests(): def test_leftovers(): commentary = warcio.tester.Commentary('id', 'type') + assert not commentary.has_comments() # hard to test because invalid WARC Content-Length raises in archiveiterator warcio.tester.validate_content_length('content-length', 'not-an-integer', None, '1.0', commentary, None) diff --git a/warcio/tester.py b/warcio/tester.py index de9f3ca1..eaf7f09f 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -638,6 +638,7 @@ def _process_one(warc): print(' digest pass') elif record.digest_checker.passed is None: if digest_present: # pragma: no cover + # WARC record missing Content-Length: header, which is verboten print(' digest present but not checked') else: print(' digest not present') From 2e1d82012ad958f96ad30440d044207c9c187634 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Mon, 28 Jan 2019 00:05:31 -0800 Subject: [PATCH 47/68] py2 branch coverage --- test/test_tests.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_tests.py b/test/test_tests.py index 174466c8..98517308 100644 --- a/test/test_tests.py +++ b/test/test_tests.py @@ -191,6 +191,7 @@ def test_torture_validate_record(): if six.PY2: expected = 
expected.replace('\n error: warc-fields contains invalid utf-8: \'utf-8\' codec can\'t decode byte 0xc3 in position 57: invalid continuation byte\n', '\n') + ret = ret.replace('\n comment: did not check ip address format, install ipaddress module from pypi if you care\n', '\n') assert ret == expected From bbdb57b4d37900ec0220f251860a06ead96093c1 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Mon, 28 Jan 2019 10:09:17 -0800 Subject: [PATCH 48/68] py2 testing --- setup.py | 17 +++++++++++------ test/test_tests.py | 20 +++++--------------- warcio/tester.py | 26 ++++++++++++++++++++------ 3 files changed, 36 insertions(+), 27 deletions(-) diff --git a/setup.py b/setup.py index 0203bb64..f0390160 100755 --- a/setup.py +++ b/setup.py @@ -4,6 +4,7 @@ from setuptools import setup, find_packages from setuptools.command.test import test as TestCommand import glob +import sys __version__ = '1.7.1' @@ -21,6 +22,15 @@ def run_tests(self): errcode = pytest.main(['--doctest-modules', './warcio', '--cov', 'warcio', '-v', 'test/']) sys.exit(errcode) +tests_require = [ + 'pytest', + 'pytest-cov', + 'httpbin==0.5.0', + 'requests', +] +if sys.version_info < (3, 3): + tests_require.append('ipaddress') + setup( name='warcio', version=__version__, @@ -44,12 +54,7 @@ def run_tests(self): """, cmdclass={'test': PyTest}, test_suite='', - tests_require=[ - 'pytest', - 'pytest-cov', - 'httpbin==0.5.0', - 'requests', - ], + tests_require=tests_require, classifiers=[ 'Development Status :: 5 - Production/Stable', 'Environment :: Web Environment', diff --git a/test/test_tests.py b/test/test_tests.py index 98517308..dab1e669 100644 --- a/test/test_tests.py +++ b/test/test_tests.py @@ -187,13 +187,9 @@ def test_torture_validate_record(): value = helper(args, 0) print(remove_before_test_data(value)) - ret = remove_before_test_data(value) + actual = remove_before_test_data(value) - if six.PY2: - expected = expected.replace('\n error: warc-fields contains invalid utf-8: \'utf-8\' codec can\'t 
decode byte 0xc3 in position 57: invalid continuation byte\n', '\n') - ret = ret.replace('\n comment: did not check ip address format, install ipaddress module from pypi if you care\n', '\n') - - assert ret == expected + assert actual == expected def test_torture_validate_field(): @@ -278,16 +274,10 @@ def test_torture_validate_field(): """ value = helper(args, 0) - ret = remove_before_test_data(value) - - if six.PY2: - if 'error: invalid ip warc-ip-address 1.2.3.4.5' not in ret: - # user did not install ipaddress module - expected = expected.replace('\n error: invalid ip warc-ip-address 1.2.3.4.5\n', '\n') - ret = ret.replace('\n comment: did not check ip address format, install ipaddress module from pypi if you care\n', '\n') + actual = remove_before_test_data(value) - print(ret) - assert ret == expected + print(actual) + assert actual == expected def test_arc(): diff --git a/warcio/tester.py b/warcio/tester.py index eaf7f09f..f00479ff 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -2,6 +2,7 @@ import re import sys +import six from warcio.archiveiterator import WARCIterator from warcio.utils import to_native_str, Digester @@ -68,11 +69,22 @@ def validate_warc_fields(record, commentary): # field-name = token # token_re content = record.content - try: - text = to_native_str(content, 'utf-8', errors='strict') - except UnicodeDecodeError as e: - commentary.error('warc-fields contains invalid utf-8: '+str(e)) - text = to_native_str(content, 'utf-8', errors='replace') + + if six.PY2: # pragma: no cover + try: + content.decode('utf-8', errors='strict') + text = content # already a str + except UnicodeDecodeError as e: + err = str(e) + err = err.replace('utf8', 'utf-8') # sigh + commentary.error('warc-fields contains invalid utf-8: '+err) + text = content.decode('utf-8', errors='replace') + else: # pragma: no cover + try: + text = to_native_str(content, 'utf-8', errors='strict') + except UnicodeDecodeError as e: + commentary.error('warc-fields contains invalid 
utf-8: '+str(e)) + text = to_native_str(content, 'utf-8', errors='replace') first_line = True lines = [] @@ -350,10 +362,12 @@ def validate_digest(field, value, record, version, commentary, pending): def validate_ip(field, value, record, version, commentary, pending): try: import ipaddress + if six.PY2: # pragma: no cover + value = unicode(value) ipaddress.ip_address(value) except ValueError: commentary.error('invalid ip', field, value) - except (ImportError, NameError): # pragma: no cover (for python 2.7) + except (ImportError, NameError): # pragma: no cover commentary.comment('did not check ip address format, install ipaddress module from pypi if you care') From fc2d7b42549deada63ad1e65e81214a90fa75301 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Mon, 28 Jan 2019 11:15:57 -0800 Subject: [PATCH 49/68] add record ids to test --- test/data/standard-torture-missing.warc | 5 - .../standard-torture-validate-record.warc | 25 +++++ test/test_tests.py | 91 ++++++++----------- warcio/tester.py | 2 +- 4 files changed, 63 insertions(+), 60 deletions(-) delete mode 100644 test/data/standard-torture-missing.warc diff --git a/test/data/standard-torture-missing.warc b/test/data/standard-torture-missing.warc deleted file mode 100644 index a1ab0714..00000000 --- a/test/data/standard-torture-missing.warc +++ /dev/null @@ -1,5 +0,0 @@ -WARC/1.0 -WARC-Type: warcinfo -Content-Length: 0 - - diff --git a/test/data/standard-torture-validate-record.warc b/test/data/standard-torture-validate-record.warc index 6f06205e..fa03b38e 100644 --- a/test/data/standard-torture-validate-record.warc +++ b/test/data/standard-torture-validate-record.warc @@ -15,13 +15,24 @@ token cannot have a space: WARC/1.0 +WARC-Record-ID: test-empty-warc-fields WARC-Type: warcinfo Content-Type: application/warc-fields Content-Length: 0 +WARC/1.0 +WARC-Type: warcinfo +WARC-Record-ID: test-warcinfo-non-recommended-content-type +Content-Type: not-application/warc-fields +Content-Length: 5 + +foo + + WARC/1.0 
WARC-Type: response +WARC-Record-ID: test-response-content-type WARC-Target-URI: HtTp://example.com/ Content-Type: text/plain Content-Length: 0 @@ -29,6 +40,7 @@ Content-Length: 0 WARC/1.0 WARC-Type: resource +WARC-Record-ID: test-resource-dns-content-type WARC-Target-URI: DnS:asdfasdf Content-Type: text/plain Content-Length: 0 @@ -36,6 +48,8 @@ Content-Length: 0 WARC/1.0 WARC-Type: resource +WARC-Record-ID: test-resource-dns-empty +WARC-Test-TODO: add another with valid block WARC-Target-URI: DnS:asdfasdf Content-Type: text/dns Content-Length: 0 @@ -43,12 +57,14 @@ Content-Length: 0 WARC/1.0 WARC-Type: resource +WARC-Record-ID: test-resource-not-dns WARC-Target-URI: foo:bar Content-Length: 0 WARC/1.0 WARC-Type: request +WARC-Record-ID: test-request-unrecommended-content-type WARC-Target-URI: hTtP://example.com/ Content-Type: text/plain Content-Length: 0 @@ -56,6 +72,7 @@ Content-Length: 0 WARC/1.0 WARC-Type: request +WARC-Record-ID: test-request-unrecommended-content-type-with-ip WARC-Target-URI: hTtP://example.com/ WARC-IP-Address: 1.2.3.4 Content-Type: text/plain @@ -64,47 +81,55 @@ Content-Length: 0 WARC/1.0 WARC-Type: metadata +WARC-Record-ID: test-metadata-warc-fields-empty Content-Type: application/warc-fields Content-Length: 0 WARC/1.0 WARC-Type: metadata +WARC-Record-ID: test-metadata-not-warc-fields Content-Type: not-application/warc-fields Content-Length: 0 WARC/1.0 WARC-Type: revisit +WARC-Record-ID: test-revisit-profile-unknown WARC-Profile: none Content-Length: 0 WARC/1.0 WARC-Type: revisit +WARC-Record-ID: test-revisit-profile-future WARC-Profile: http://netpreserve.org/warc/1.1/revisit/identical-payload-digest Content-Length: 0 WARC/1.0 WARC-Type: revisit +WARC-Record-ID: test-revisit-profile-good WARC-Profile: http://netpreserve.org/warc/1.0/revisit/server-not-modified Content-Length: 0 WARC/1.0 WARC-Type: conversion +WARC-Record-ID: test-conversion Content-Length: 0 WARC/1.0 WARC-Type: continuation +WARC-Record-ID: test-continuation-segment-1 
WARC-Segment-Number: 1 Content-Length: 0 WARC/1.0 WARC-Type: continuation +WARC-Record-ID: test-continuation-segment-valid WARC-Segment-Number: 2 Content-Length: 0 diff --git a/test/test_tests.py b/test/test_tests.py index dab1e669..723b2bd9 100644 --- a/test/test_tests.py +++ b/test/test_tests.py @@ -1,5 +1,3 @@ -import six - from warcio.cli import main from warcio.utils import to_native_str import warcio.tester @@ -32,28 +30,6 @@ def remove_before_test_data(s): return ret -def test_torture_missing(): - files = ['standard-torture-missing.warc'] - files = [get_test_file(filename) for filename in files] - - args = ['test'] - args.extend(files) - - expected = """\ -test/data/standard-torture-missing.warc - WARC-Record-ID None - WARC-Type warcinfo - digest not present - error: missing required header Content-Type - error: missing required header WARC-Date - error: missing required header WARC-Record-ID - recommendation: warcinfo Content-Type of application/warc-fields, saw none -""" - - value = helper(args, 0) - assert remove_before_test_data(value) == expected - - def test_torture_validate_record(): files = ['standard-torture-validate-record.warc'] files = [get_test_file(filename) for filename in files] @@ -75,110 +51,117 @@ def test_torture_validate_record(): comment: warc-fields lines must end with \\r\\n: test: lines should end with \\r\\n comment: Missing field-name : in warc-fields line: no colon comment: invalid warc-fields name: token cannot have a space - WARC-Record-ID None + WARC-Record-ID test-empty-warc-fields WARC-Type warcinfo digest not present + error: uri must be within <> warc-record-id test-empty-warc-fields error: missing required header WARC-Date - error: missing required header WARC-Record-ID comment: warc-fields body present but empty - WARC-Record-ID None + WARC-Record-ID test-warcinfo-non-recommended-content-type + WARC-Type warcinfo + digest not present + error: uri must be within <> warc-record-id test-warcinfo-non-recommended-content-type 
+ error: missing required header WARC-Date + recommendation: warcinfo Content-Type recommended to be application/warc-fields, saw not-application/warc-fields + WARC-Record-ID test-response-content-type WARC-Type response digest not present + error: uri must be within <> warc-record-id test-response-content-type error: missing required header WARC-Date - error: missing required header WARC-Record-ID error: responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw text/plain error: WARC-IP-Address should be used for http and https responses - WARC-Record-ID None + WARC-Record-ID test-resource-dns-content-type WARC-Type resource digest not present + error: uri must be within <> warc-record-id test-resource-dns-content-type error: missing required header WARC-Date - error: missing required header WARC-Record-ID error: recource records for dns: shall have Content-Type of text/dns, saw text/plain - WARC-Record-ID None + WARC-Record-ID test-resource-dns-empty WARC-Type resource digest not present + error: uri must be within <> warc-record-id test-resource-dns-empty error: missing required header WARC-Date - error: missing required header WARC-Record-ID - WARC-Record-ID None + comment: unknown field, no validation performed WARC-Test-TODO add another with valid block + WARC-Record-ID test-resource-not-dns WARC-Type resource digest not present + error: uri must be within <> warc-record-id test-resource-not-dns error: missing required header Content-Type error: missing required header WARC-Date - error: missing required header WARC-Record-ID - WARC-Record-ID None + WARC-Record-ID test-request-unrecommended-content-type WARC-Type request digest not present + error: uri must be within <> warc-record-id test-request-unrecommended-content-type error: missing required header WARC-Date - error: missing required header WARC-Record-ID error: requests for http/https should have Content-Type of application/http; msgtype=request or 
application/http, saw text/plain error: WARC-IP-Address should be used for http and https requests - WARC-Record-ID None + WARC-Record-ID test-request-unrecommended-content-type-with-ip WARC-Type request digest not present + error: uri must be within <> warc-record-id test-request-unrecommended-content-type-with-ip error: missing required header WARC-Date - error: missing required header WARC-Record-ID error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw text/plain - WARC-Record-ID None + WARC-Record-ID test-metadata-warc-fields-empty WARC-Type metadata digest not present + error: uri must be within <> warc-record-id test-metadata-warc-fields-empty error: missing required header WARC-Date - error: missing required header WARC-Record-ID comment: warc-fields body present but empty - WARC-Record-ID None + WARC-Record-ID test-metadata-not-warc-fields WARC-Type metadata digest not present + error: uri must be within <> warc-record-id test-metadata-not-warc-fields error: missing required header WARC-Date - error: missing required header WARC-Record-ID - WARC-Record-ID None + WARC-Record-ID test-revisit-profile-unknown WARC-Type revisit digest not present + error: uri must be within <> warc-record-id test-revisit-profile-unknown error: missing required header Content-Type error: missing required header WARC-Date - error: missing required header WARC-Record-ID error: missing required header WARC-Target-URI comment: extension seen warc-profile none comment: no revisit details validation done due to unknown profile - WARC-Record-ID None + WARC-Record-ID test-revisit-profile-future WARC-Type revisit digest not present + error: uri must be within <> warc-record-id test-revisit-profile-future error: missing required header Content-Type error: missing required header WARC-Date - error: missing required header WARC-Record-ID error: missing required header WARC-Target-URI error: missing required header 
WARC-Payload-Digest recommendation: missing recommended header WARC-Refers-To recommendation: missing recommended header WARC-Refers-To-Date recommendation: missing recommended header WARC-Refers-To-Target-URI comment: extension seen warc-profile http://netpreserve.org/warc/1.1/revisit/identical-payload-digest - WARC-Record-ID None + WARC-Record-ID test-revisit-profile-good WARC-Type revisit digest not present + error: uri must be within <> warc-record-id test-revisit-profile-good error: missing required header Content-Type error: missing required header WARC-Date - error: missing required header WARC-Record-ID error: missing required header WARC-Target-URI recommendation: missing recommended header WARC-Refers-To recommendation: missing recommended header WARC-Refers-To-Date - WARC-Record-ID None + WARC-Record-ID test-conversion WARC-Type conversion digest not present + error: uri must be within <> warc-record-id test-conversion error: missing required header WARC-Date - error: missing required header WARC-Record-ID error: missing required header WARC-Target-URI - WARC-Record-ID None + WARC-Record-ID test-continuation-segment-1 WARC-Type continuation digest not present + error: uri must be within <> warc-record-id test-continuation-segment-1 error: missing required header WARC-Date - error: missing required header WARC-Record-ID error: missing required header WARC-Segment-Origin-ID error: missing required header WARC-Target-URI error: continuation record must have WARC-Segment-Number > 1, saw 1 comment: warcio test continuation code has not been tested, expect bugs - WARC-Record-ID None + WARC-Record-ID test-continuation-segment-valid WARC-Type continuation digest not present + error: uri must be within <> warc-record-id test-continuation-segment-valid error: missing required header WARC-Date - error: missing required header WARC-Record-ID error: missing required header WARC-Segment-Origin-ID error: missing required header WARC-Target-URI comment: warcio test 
continuation code has not been tested, expect bugs diff --git a/warcio/tester.py b/warcio/tester.py index f00479ff..e9755c8c 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -125,7 +125,7 @@ def validate_warc_fields(record, commentary): def validate_warcinfo(record, commentary, pending): content_type = record.rec_headers.get_header('Content-Type', 'none') if content_type.lower() != 'application/warc-fields': - commentary.recommendation('warcinfo Content-Type of application/warc-fields, saw', content_type) + commentary.recommendation('warcinfo Content-Type recommended to be application/warc-fields, saw', content_type) else: # format: warc-fields # allowable fields include but not limited to DMCI plus the following From d1fe18edb4220acb53b5304fde7e321679c7c42d Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Mon, 28 Jan 2019 12:19:35 -0800 Subject: [PATCH 50/68] preserve capitalization in messages --- test/test_tests.py | 142 ++++++++++++++++++++++----------------------- warcio/tester.py | 15 +++-- 2 files changed, 78 insertions(+), 79 deletions(-) diff --git a/test/test_tests.py b/test/test_tests.py index 723b2bd9..c922eff1 100644 --- a/test/test_tests.py +++ b/test/test_tests.py @@ -42,7 +42,7 @@ def test_torture_validate_record(): WARC-Record-ID None WARC-Type warcinfo digest not present - error: uri must be within <> warc-refers-to probhibited + error: uri must be within <> WARC-Refers-To probhibited error: missing required header WARC-Date error: missing required header WARC-Record-ID error: field not allowed in record_type WARC-Refers-To warcinfo @@ -54,77 +54,77 @@ def test_torture_validate_record(): WARC-Record-ID test-empty-warc-fields WARC-Type warcinfo digest not present - error: uri must be within <> warc-record-id test-empty-warc-fields + error: uri must be within <> WARC-Record-ID test-empty-warc-fields error: missing required header WARC-Date comment: warc-fields body present but empty WARC-Record-ID test-warcinfo-non-recommended-content-type 
WARC-Type warcinfo digest not present - error: uri must be within <> warc-record-id test-warcinfo-non-recommended-content-type + error: uri must be within <> WARC-Record-ID test-warcinfo-non-recommended-content-type error: missing required header WARC-Date recommendation: warcinfo Content-Type recommended to be application/warc-fields, saw not-application/warc-fields WARC-Record-ID test-response-content-type WARC-Type response digest not present - error: uri must be within <> warc-record-id test-response-content-type + error: uri must be within <> WARC-Record-ID test-response-content-type error: missing required header WARC-Date error: responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw text/plain error: WARC-IP-Address should be used for http and https responses WARC-Record-ID test-resource-dns-content-type WARC-Type resource digest not present - error: uri must be within <> warc-record-id test-resource-dns-content-type + error: uri must be within <> WARC-Record-ID test-resource-dns-content-type error: missing required header WARC-Date error: recource records for dns: shall have Content-Type of text/dns, saw text/plain WARC-Record-ID test-resource-dns-empty WARC-Type resource digest not present - error: uri must be within <> warc-record-id test-resource-dns-empty + error: uri must be within <> WARC-Record-ID test-resource-dns-empty error: missing required header WARC-Date comment: unknown field, no validation performed WARC-Test-TODO add another with valid block WARC-Record-ID test-resource-not-dns WARC-Type resource digest not present - error: uri must be within <> warc-record-id test-resource-not-dns + error: uri must be within <> WARC-Record-ID test-resource-not-dns error: missing required header Content-Type error: missing required header WARC-Date WARC-Record-ID test-request-unrecommended-content-type WARC-Type request digest not present - error: uri must be within <> warc-record-id 
test-request-unrecommended-content-type + error: uri must be within <> WARC-Record-ID test-request-unrecommended-content-type error: missing required header WARC-Date error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw text/plain error: WARC-IP-Address should be used for http and https requests WARC-Record-ID test-request-unrecommended-content-type-with-ip WARC-Type request digest not present - error: uri must be within <> warc-record-id test-request-unrecommended-content-type-with-ip + error: uri must be within <> WARC-Record-ID test-request-unrecommended-content-type-with-ip error: missing required header WARC-Date error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw text/plain WARC-Record-ID test-metadata-warc-fields-empty WARC-Type metadata digest not present - error: uri must be within <> warc-record-id test-metadata-warc-fields-empty + error: uri must be within <> WARC-Record-ID test-metadata-warc-fields-empty error: missing required header WARC-Date comment: warc-fields body present but empty WARC-Record-ID test-metadata-not-warc-fields WARC-Type metadata digest not present - error: uri must be within <> warc-record-id test-metadata-not-warc-fields + error: uri must be within <> WARC-Record-ID test-metadata-not-warc-fields error: missing required header WARC-Date WARC-Record-ID test-revisit-profile-unknown WARC-Type revisit digest not present - error: uri must be within <> warc-record-id test-revisit-profile-unknown + error: uri must be within <> WARC-Record-ID test-revisit-profile-unknown error: missing required header Content-Type error: missing required header WARC-Date error: missing required header WARC-Target-URI - comment: extension seen warc-profile none + comment: extension seen WARC-Profile none comment: no revisit details validation done due to unknown profile WARC-Record-ID test-revisit-profile-future WARC-Type 
revisit digest not present - error: uri must be within <> warc-record-id test-revisit-profile-future + error: uri must be within <> WARC-Record-ID test-revisit-profile-future error: missing required header Content-Type error: missing required header WARC-Date error: missing required header WARC-Target-URI @@ -132,11 +132,11 @@ def test_torture_validate_record(): recommendation: missing recommended header WARC-Refers-To recommendation: missing recommended header WARC-Refers-To-Date recommendation: missing recommended header WARC-Refers-To-Target-URI - comment: extension seen warc-profile http://netpreserve.org/warc/1.1/revisit/identical-payload-digest + comment: extension seen WARC-Profile http://netpreserve.org/warc/1.1/revisit/identical-payload-digest WARC-Record-ID test-revisit-profile-good WARC-Type revisit digest not present - error: uri must be within <> warc-record-id test-revisit-profile-good + error: uri must be within <> WARC-Record-ID test-revisit-profile-good error: missing required header Content-Type error: missing required header WARC-Date error: missing required header WARC-Target-URI @@ -145,13 +145,13 @@ def test_torture_validate_record(): WARC-Record-ID test-conversion WARC-Type conversion digest not present - error: uri must be within <> warc-record-id test-conversion + error: uri must be within <> WARC-Record-ID test-conversion error: missing required header WARC-Date error: missing required header WARC-Target-URI WARC-Record-ID test-continuation-segment-1 WARC-Type continuation digest not present - error: uri must be within <> warc-record-id test-continuation-segment-1 + error: uri must be within <> WARC-Record-ID test-continuation-segment-1 error: missing required header WARC-Date error: missing required header WARC-Segment-Origin-ID error: missing required header WARC-Target-URI @@ -160,7 +160,7 @@ def test_torture_validate_record(): WARC-Record-ID test-continuation-segment-valid WARC-Type continuation digest not present - error: uri must be 
within <> warc-record-id test-continuation-segment-valid + error: uri must be within <> WARC-Record-ID test-continuation-segment-valid error: missing required header WARC-Date error: missing required header WARC-Segment-Origin-ID error: missing required header WARC-Target-URI @@ -187,64 +187,64 @@ def test_torture_validate_field(): WARC-Record-ID WARC-Type does-not-exist unknown hash algorithm name in block digest - error: uri must not be within <> warc-target-uri - error: invalid uri scheme, bad character warc-target-uri - error: duplicate field seen warc-target-uri example.com - error: invalid uri, no scheme warc-target-uri example.com - error: duplicate field seen warc-target-uri ex ample.com - error: invalid uri, no scheme warc-target-uri ex ample.com - error: invalid uri, contains whitespace warc-target-uri ex ample.com - error: invalid uri scheme, bad character warc-target-uri ex ample.com - error: duplicate field seen warc-target-uri h<>ttp://example.com/ - error: invalid uri scheme, bad character warc-target-uri h<>ttp://example.com/ - error: duplicate field seen warc-type CAPITALIZED - error: uri must be within <> warc-concurrent-to http://example.com/ - error: duplicate field seen warc-date 2017-03-06T04:03:53.Z - error: WARC 1.0 may not have fractional seconds warc-date 2017-03-06T04:03:53.Z - error: must contain a / content-type asdf - error: invalid subtype content-type asdf - error: duplicate field seen content-type has space/asdf - error: invalid type content-type has space/asdf - error: duplicate field seen content-type asdf/has space - error: invalid subtype content-type asdf/has space - error: duplicate field seen content-type asdf/has space;asdf - error: invalid subtype content-type asdf/has space;asdf - error: missing algorithm warc-block-digest asdf - error: duplicate field seen warc-block-digest has space:asdf - error: invalid algorithm warc-block-digest has space:asdf - error: duplicate field seen warc-block-digest sha1:&$*^&*^#*&^ - error: 
invalid ip warc-ip-address 1.2.3.4.5 - error: uri must be within <> warc-warcinfo-id asdf:asdf - error: duplicate field seen warc-profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest - error: must contain a / warc-identified-payload-type asdf - error: invalid subtype warc-identified-payload-type asdf - error: uri must be within <> warc-segment-origin-id http://example.com - error: must be an integer warc-segment-number not-an-integer - error: duplicate field seen warc-segment-number 0 - error: must be 1 or greater warc-segment-number 0 - error: non-continuation records must always have WARC-Segment-Number = 1 warc-segment-number 0 - error: duplicate field seen warc-segment-number 1 - error: duplicate field seen warc-segment-number 2 - error: non-continuation records must always have WARC-Segment-Number = 1 warc-segment-number 2 - error: duplicate field seen warc-segment-total-length not-an-integer - error: must be an integer warc-segment-total-length not-an-integer - comment: unknown WARC-Type warc-type does-not-exist - comment: WARC-Type is not lower-case warc-type CAPITALIZED - comment: unknown WARC-Type warc-type CAPITALIZED - comment: unknown digest algorithm warc-block-digest asdf - comment: Invalid-looking digest value warc-block-digest sha1:&$*^&*^#*&^ - comment: extension seen warc-truncated invalid - comment: extension seen warc-profile asdf + error: uri must not be within <> WARC-Target-URI + error: invalid uri scheme, bad character WARC-Target-URI + error: duplicate field seen WARC-Target-URI example.com + error: invalid uri, no scheme WARC-Target-URI example.com + error: duplicate field seen WARC-Target-URI ex ample.com + error: invalid uri, no scheme WARC-Target-URI ex ample.com + error: invalid uri, contains whitespace WARC-Target-URI ex ample.com + error: invalid uri scheme, bad character WARC-Target-URI ex ample.com + error: duplicate field seen WARC-Target-URI h<>ttp://example.com/ + error: invalid uri scheme, bad character 
WARC-Target-URI h<>ttp://example.com/ + error: duplicate field seen WARC-Type CAPITALIZED + error: uri must be within <> WARC-Concurrent-To http://example.com/ + error: duplicate field seen WARC-Date 2017-03-06T04:03:53.Z + error: WARC 1.0 may not have fractional seconds WARC-Date 2017-03-06T04:03:53.Z + error: must contain a / Content-Type asdf + error: invalid subtype Content-Type asdf + error: duplicate field seen Content-Type has space/asdf + error: invalid type Content-Type has space/asdf + error: duplicate field seen Content-Type asdf/has space + error: invalid subtype Content-Type asdf/has space + error: duplicate field seen Content-Type asdf/has space;asdf + error: invalid subtype Content-Type asdf/has space;asdf + error: missing algorithm WARC-Block-Digest asdf + error: duplicate field seen WARC-Block-Digest has space:asdf + error: invalid algorithm WARC-Block-Digest has space:asdf + error: duplicate field seen WARC-Block-Digest sha1:&$*^&*^#*&^ + error: invalid ip WARC-IP-Address 1.2.3.4.5 + error: uri must be within <> WARC-Warcinfo-ID asdf:asdf + error: duplicate field seen WARC-Profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest + error: must contain a / WARC-Identified-Payload-Type asdf + error: invalid subtype WARC-Identified-Payload-Type asdf + error: uri must be within <> WARC-Segment-Origin-ID http://example.com + error: must be an integer WARC-Segment-Number not-an-integer + error: duplicate field seen WARC-Segment-Number 0 + error: must be 1 or greater WARC-Segment-Number 0 + error: non-continuation records must always have WARC-Segment-Number = 1 WARC-Segment-Number 0 + error: duplicate field seen WARC-Segment-Number 1 + error: duplicate field seen WARC-Segment-Number 2 + error: non-continuation records must always have WARC-Segment-Number = 1 WARC-Segment-Number 2 + error: duplicate field seen WARC-Segment-Total-Length not-an-integer + error: must be an integer WARC-Segment-Total-Length not-an-integer + comment: unknown 
WARC-Type WARC-Type does-not-exist + comment: WARC-Type is not lower-case WARC-Type CAPITALIZED + comment: unknown WARC-Type WARC-Type CAPITALIZED + comment: unknown digest algorithm WARC-Block-Digest asdf + comment: Invalid-looking digest value WARC-Block-Digest sha1:&$*^&*^#*&^ + comment: extension seen WARC-Truncated invalid + comment: extension seen WARC-Profile asdf comment: field was introduced after this warc version WARC-Refers-To-Target-URI http://example.com 1.0 comment: field was introduced after this warc version WARC-Refers-To-Date not-a-date 1.0 comment: unknown field, no validation performed WARC-Unknown-Field asdf WARC-Record-ID None WARC-Type invalid digest not present - error: duplicate field seen warc-date 2017-03-06T04:03:53.Z - error: fractional seconds must have 1-9 digits warc-date 2017-03-06T04:03:53.Z - error: duplicate field seen warc-date 2017-03-06T04:03:53.0Z - comment: unknown WARC-Type warc-type invalid + error: duplicate field seen WARC-Date 2017-03-06T04:03:53.Z + error: fractional seconds must have 1-9 digits WARC-Date 2017-03-06T04:03:53.Z + error: duplicate field seen WARC-Date 2017-03-06T04:03:53.0Z + comment: unknown WARC-Type WARC-Type invalid WARC-Record-ID None WARC-Type request digest not present diff --git a/warcio/tester.py b/warcio/tester.py index e9755c8c..2300d062 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -603,19 +603,18 @@ def validate_record(record): seen_fields = set() for field, value in record.rec_headers.headers: - field_case = field - field = field.lower() - if field != 'warc-concurrent-to' and field in seen_fields: + field_l = field.lower() + if field != 'warc-concurrent-to' and field_l in seen_fields: commentary.error('duplicate field seen', field, value) - seen_fields.add(field) - if field not in warc_fields: - commentary.comment('unknown field, no validation performed', field_case, value) + seen_fields.add(field_l) + if field_l not in warc_fields: + commentary.comment('unknown field, no 
validation performed', field, value) continue - config = warc_fields[field] + config = warc_fields[field_l] if 'minver' in config: if version < config['minver']: # unknown fields are extensions, so this is a comment and not an error - commentary.comment('field was introduced after this warc version', field_case, value, version) + commentary.comment('field was introduced after this warc version', field, value, version) if 'validate' in config: config['validate'](field, value, record, version, commentary, pending) From 484da9c4cad5f058cec41f68d46b7e31cfd74fef Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Mon, 28 Jan 2019 12:55:48 -0800 Subject: [PATCH 51/68] capitals and colons --- test/test_tests.py | 264 ++++++++++++++++++++++----------------------- warcio/tester.py | 88 +++++++-------- 2 files changed, 177 insertions(+), 175 deletions(-) diff --git a/test/test_tests.py b/test/test_tests.py index c922eff1..91eba656 100644 --- a/test/test_tests.py +++ b/test/test_tests.py @@ -42,128 +42,128 @@ def test_torture_validate_record(): WARC-Record-ID None WARC-Type warcinfo digest not present - error: uri must be within <> WARC-Refers-To probhibited - error: missing required header WARC-Date - error: missing required header WARC-Record-ID - error: field not allowed in record_type WARC-Refers-To warcinfo + error: uri must be within <>: WARC-Refers-To probhibited + error: missing required header: WARC-Date + error: missing required header: WARC-Record-ID + error: field not allowed in record type: warcinfo WARC-Refers-To error: warc-fields contains invalid utf-8: 'utf-8' codec can't decode byte 0xc3 in position 57: invalid continuation byte comment: The first line of warc-fields cannot start with whitespace comment: warc-fields lines must end with \\r\\n: test: lines should end with \\r\\n - comment: Missing field-name : in warc-fields line: no colon - comment: invalid warc-fields name: token cannot have a space + comment: Missing colon in warc-fields line: no colon + 
comment: Invalid warc-fields name: token cannot have a space WARC-Record-ID test-empty-warc-fields WARC-Type warcinfo digest not present - error: uri must be within <> WARC-Record-ID test-empty-warc-fields - error: missing required header WARC-Date + error: uri must be within <>: WARC-Record-ID test-empty-warc-fields + error: missing required header: WARC-Date comment: warc-fields body present but empty WARC-Record-ID test-warcinfo-non-recommended-content-type WARC-Type warcinfo digest not present - error: uri must be within <> WARC-Record-ID test-warcinfo-non-recommended-content-type - error: missing required header WARC-Date - recommendation: warcinfo Content-Type recommended to be application/warc-fields, saw not-application/warc-fields + error: uri must be within <>: WARC-Record-ID test-warcinfo-non-recommended-content-type + error: missing required header: WARC-Date + recommendation: warcinfo Content-Type recommended to be application/warc-fields, saw: not-application/warc-fields WARC-Record-ID test-response-content-type WARC-Type response digest not present - error: uri must be within <> WARC-Record-ID test-response-content-type - error: missing required header WARC-Date - error: responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw text/plain + error: uri must be within <>: WARC-Record-ID test-response-content-type + error: missing required header: WARC-Date + error: responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw: text/plain error: WARC-IP-Address should be used for http and https responses WARC-Record-ID test-resource-dns-content-type WARC-Type resource digest not present - error: uri must be within <> WARC-Record-ID test-resource-dns-content-type - error: missing required header WARC-Date - error: recource records for dns: shall have Content-Type of text/dns, saw text/plain + error: uri must be within <>: WARC-Record-ID 
test-resource-dns-content-type + error: missing required header: WARC-Date + error: recource records for dns: shall have Content-Type of text/dns, saw: text/plain WARC-Record-ID test-resource-dns-empty WARC-Type resource digest not present - error: uri must be within <> WARC-Record-ID test-resource-dns-empty - error: missing required header WARC-Date - comment: unknown field, no validation performed WARC-Test-TODO add another with valid block + error: uri must be within <>: WARC-Record-ID test-resource-dns-empty + error: missing required header: WARC-Date + comment: unknown field, no validation performed: WARC-Test-TODO add another with valid block WARC-Record-ID test-resource-not-dns WARC-Type resource digest not present - error: uri must be within <> WARC-Record-ID test-resource-not-dns - error: missing required header Content-Type - error: missing required header WARC-Date + error: uri must be within <>: WARC-Record-ID test-resource-not-dns + error: missing required header: Content-Type + error: missing required header: WARC-Date WARC-Record-ID test-request-unrecommended-content-type WARC-Type request digest not present - error: uri must be within <> WARC-Record-ID test-request-unrecommended-content-type - error: missing required header WARC-Date - error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw text/plain + error: uri must be within <>: WARC-Record-ID test-request-unrecommended-content-type + error: missing required header: WARC-Date + error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw: text/plain error: WARC-IP-Address should be used for http and https requests WARC-Record-ID test-request-unrecommended-content-type-with-ip WARC-Type request digest not present - error: uri must be within <> WARC-Record-ID test-request-unrecommended-content-type-with-ip - error: missing required header WARC-Date - error: requests for http/https 
should have Content-Type of application/http; msgtype=request or application/http, saw text/plain + error: uri must be within <>: WARC-Record-ID test-request-unrecommended-content-type-with-ip + error: missing required header: WARC-Date + error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw: text/plain WARC-Record-ID test-metadata-warc-fields-empty WARC-Type metadata digest not present - error: uri must be within <> WARC-Record-ID test-metadata-warc-fields-empty - error: missing required header WARC-Date + error: uri must be within <>: WARC-Record-ID test-metadata-warc-fields-empty + error: missing required header: WARC-Date comment: warc-fields body present but empty WARC-Record-ID test-metadata-not-warc-fields WARC-Type metadata digest not present - error: uri must be within <> WARC-Record-ID test-metadata-not-warc-fields - error: missing required header WARC-Date + error: uri must be within <>: WARC-Record-ID test-metadata-not-warc-fields + error: missing required header: WARC-Date WARC-Record-ID test-revisit-profile-unknown WARC-Type revisit digest not present - error: uri must be within <> WARC-Record-ID test-revisit-profile-unknown - error: missing required header Content-Type - error: missing required header WARC-Date - error: missing required header WARC-Target-URI - comment: extension seen WARC-Profile none - comment: no revisit details validation done due to unknown profile + error: uri must be within <>: WARC-Record-ID test-revisit-profile-unknown + error: missing required header: Content-Type + error: missing required header: WARC-Date + error: missing required header: WARC-Target-URI + comment: extension seen: WARC-Profile none + comment: no revisit details validation done due to unknown profile: none WARC-Record-ID test-revisit-profile-future WARC-Type revisit digest not present - error: uri must be within <> WARC-Record-ID test-revisit-profile-future - error: missing required header 
Content-Type - error: missing required header WARC-Date - error: missing required header WARC-Target-URI - error: missing required header WARC-Payload-Digest - recommendation: missing recommended header WARC-Refers-To - recommendation: missing recommended header WARC-Refers-To-Date - recommendation: missing recommended header WARC-Refers-To-Target-URI - comment: extension seen WARC-Profile http://netpreserve.org/warc/1.1/revisit/identical-payload-digest + error: uri must be within <>: WARC-Record-ID test-revisit-profile-future + error: missing required header: Content-Type + error: missing required header: WARC-Date + error: missing required header: WARC-Target-URI + error: missing required header: WARC-Payload-Digest + recommendation: missing recommended header: WARC-Refers-To + recommendation: missing recommended header: WARC-Refers-To-Date + recommendation: missing recommended header: WARC-Refers-To-Target-URI + comment: extension seen: WARC-Profile http://netpreserve.org/warc/1.1/revisit/identical-payload-digest WARC-Record-ID test-revisit-profile-good WARC-Type revisit digest not present - error: uri must be within <> WARC-Record-ID test-revisit-profile-good - error: missing required header Content-Type - error: missing required header WARC-Date - error: missing required header WARC-Target-URI - recommendation: missing recommended header WARC-Refers-To - recommendation: missing recommended header WARC-Refers-To-Date + error: uri must be within <>: WARC-Record-ID test-revisit-profile-good + error: missing required header: Content-Type + error: missing required header: WARC-Date + error: missing required header: WARC-Target-URI + recommendation: missing recommended header: WARC-Refers-To + recommendation: missing recommended header: WARC-Refers-To-Date WARC-Record-ID test-conversion WARC-Type conversion digest not present - error: uri must be within <> WARC-Record-ID test-conversion - error: missing required header WARC-Date - error: missing required header 
WARC-Target-URI + error: uri must be within <>: WARC-Record-ID test-conversion + error: missing required header: WARC-Date + error: missing required header: WARC-Target-URI WARC-Record-ID test-continuation-segment-1 WARC-Type continuation digest not present - error: uri must be within <> WARC-Record-ID test-continuation-segment-1 - error: missing required header WARC-Date - error: missing required header WARC-Segment-Origin-ID - error: missing required header WARC-Target-URI - error: continuation record must have WARC-Segment-Number > 1, saw 1 + error: uri must be within <>: WARC-Record-ID test-continuation-segment-1 + error: missing required header: WARC-Date + error: missing required header: WARC-Segment-Origin-ID + error: missing required header: WARC-Target-URI + error: continuation record must have WARC-Segment-Number > 1, saw: 1 comment: warcio test continuation code has not been tested, expect bugs WARC-Record-ID test-continuation-segment-valid WARC-Type continuation digest not present - error: uri must be within <> WARC-Record-ID test-continuation-segment-valid - error: missing required header WARC-Date - error: missing required header WARC-Segment-Origin-ID - error: missing required header WARC-Target-URI + error: uri must be within <>: WARC-Record-ID test-continuation-segment-valid + error: missing required header: WARC-Date + error: missing required header: WARC-Segment-Origin-ID + error: missing required header: WARC-Target-URI comment: warcio test continuation code has not been tested, expect bugs """ @@ -187,73 +187,73 @@ def test_torture_validate_field(): WARC-Record-ID WARC-Type does-not-exist unknown hash algorithm name in block digest - error: uri must not be within <> WARC-Target-URI - error: invalid uri scheme, bad character WARC-Target-URI - error: duplicate field seen WARC-Target-URI example.com - error: invalid uri, no scheme WARC-Target-URI example.com - error: duplicate field seen WARC-Target-URI ex ample.com - error: invalid uri, no scheme 
WARC-Target-URI ex ample.com - error: invalid uri, contains whitespace WARC-Target-URI ex ample.com - error: invalid uri scheme, bad character WARC-Target-URI ex ample.com - error: duplicate field seen WARC-Target-URI h<>ttp://example.com/ - error: invalid uri scheme, bad character WARC-Target-URI h<>ttp://example.com/ - error: duplicate field seen WARC-Type CAPITALIZED - error: uri must be within <> WARC-Concurrent-To http://example.com/ - error: duplicate field seen WARC-Date 2017-03-06T04:03:53.Z - error: WARC 1.0 may not have fractional seconds WARC-Date 2017-03-06T04:03:53.Z - error: must contain a / Content-Type asdf - error: invalid subtype Content-Type asdf - error: duplicate field seen Content-Type has space/asdf - error: invalid type Content-Type has space/asdf - error: duplicate field seen Content-Type asdf/has space - error: invalid subtype Content-Type asdf/has space - error: duplicate field seen Content-Type asdf/has space;asdf - error: invalid subtype Content-Type asdf/has space;asdf - error: missing algorithm WARC-Block-Digest asdf - error: duplicate field seen WARC-Block-Digest has space:asdf - error: invalid algorithm WARC-Block-Digest has space:asdf - error: duplicate field seen WARC-Block-Digest sha1:&$*^&*^#*&^ - error: invalid ip WARC-IP-Address 1.2.3.4.5 - error: uri must be within <> WARC-Warcinfo-ID asdf:asdf - error: duplicate field seen WARC-Profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest - error: must contain a / WARC-Identified-Payload-Type asdf - error: invalid subtype WARC-Identified-Payload-Type asdf - error: uri must be within <> WARC-Segment-Origin-ID http://example.com - error: must be an integer WARC-Segment-Number not-an-integer - error: duplicate field seen WARC-Segment-Number 0 - error: must be 1 or greater WARC-Segment-Number 0 - error: non-continuation records must always have WARC-Segment-Number = 1 WARC-Segment-Number 0 - error: duplicate field seen WARC-Segment-Number 1 - error: duplicate field 
seen WARC-Segment-Number 2 - error: non-continuation records must always have WARC-Segment-Number = 1 WARC-Segment-Number 2 - error: duplicate field seen WARC-Segment-Total-Length not-an-integer - error: must be an integer WARC-Segment-Total-Length not-an-integer - comment: unknown WARC-Type WARC-Type does-not-exist - comment: WARC-Type is not lower-case WARC-Type CAPITALIZED - comment: unknown WARC-Type WARC-Type CAPITALIZED - comment: unknown digest algorithm WARC-Block-Digest asdf - comment: Invalid-looking digest value WARC-Block-Digest sha1:&$*^&*^#*&^ - comment: extension seen WARC-Truncated invalid - comment: extension seen WARC-Profile asdf - comment: field was introduced after this warc version WARC-Refers-To-Target-URI http://example.com 1.0 - comment: field was introduced after this warc version WARC-Refers-To-Date not-a-date 1.0 - comment: unknown field, no validation performed WARC-Unknown-Field asdf + error: uri must not be within <>: WARC-Target-URI + error: invalid uri scheme, bad character: WARC-Target-URI + error: duplicate field seen: WARC-Target-URI example.com + error: invalid uri, no scheme: WARC-Target-URI example.com + error: duplicate field seen: WARC-Target-URI ex ample.com + error: invalid uri, no scheme: WARC-Target-URI ex ample.com + error: invalid uri, contains whitespace: WARC-Target-URI ex ample.com + error: invalid uri scheme, bad character: WARC-Target-URI ex ample.com + error: duplicate field seen: WARC-Target-URI h<>ttp://example.com/ + error: invalid uri scheme, bad character: WARC-Target-URI h<>ttp://example.com/ + error: duplicate field seen: WARC-Type CAPITALIZED + error: uri must be within <>: WARC-Concurrent-To http://example.com/ + error: duplicate field seen: WARC-Date 2017-03-06T04:03:53.Z + error: WARC 1.0 time may not have fractional seconds: WARC-Date 2017-03-06T04:03:53.Z + error: must contain a /: Content-Type asdf + error: invalid subtype: Content-Type asdf + error: duplicate field seen: Content-Type has space/asdf 
+ error: invalid type: Content-Type has space/asdf + error: duplicate field seen: Content-Type asdf/has space + error: invalid subtype: Content-Type asdf/has space + error: duplicate field seen: Content-Type asdf/has space;asdf + error: invalid subtype: Content-Type asdf/has space;asdf + error: missing algorithm: WARC-Block-Digest asdf + error: duplicate field seen: WARC-Block-Digest has space:asdf + error: invalid algorithm: WARC-Block-Digest has space:asdf + error: duplicate field seen: WARC-Block-Digest sha1:&$*^&*^#*&^ + error: invalid ip: WARC-IP-Address 1.2.3.4.5 + error: uri must be within <>: WARC-Warcinfo-ID asdf:asdf + error: duplicate field seen: WARC-Profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest + error: must contain a /: WARC-Identified-Payload-Type asdf + error: invalid subtype: WARC-Identified-Payload-Type asdf + error: uri must be within <>: WARC-Segment-Origin-ID http://example.com + error: must be an integer: WARC-Segment-Number not-an-integer + error: duplicate field seen: WARC-Segment-Number 0 + error: must be 1 or greater: WARC-Segment-Number 0 + error: non-continuation records must always have WARC-Segment-Number: 1: WARC-Segment-Number 0 + error: duplicate field seen: WARC-Segment-Number 1 + error: duplicate field seen: WARC-Segment-Number 2 + error: non-continuation records must always have WARC-Segment-Number: 1: WARC-Segment-Number 2 + error: duplicate field seen: WARC-Segment-Total-Length not-an-integer + error: must be an integer: WARC-Segment-Total-Length not-an-integer + comment: unknown WARC-Type: WARC-Type does-not-exist + comment: WARC-Type is not lower-case: WARC-Type CAPITALIZED + comment: unknown WARC-Type: WARC-Type CAPITALIZED + comment: unknown digest algorithm: WARC-Block-Digest asdf + comment: Invalid-looking digest value: WARC-Block-Digest sha1:&$*^&*^#*&^ + comment: extension seen: WARC-Truncated invalid + comment: extension seen: WARC-Profile asdf + comment: field was introduced after this warc 
version: 1.0 WARC-Refers-To-Target-URI http://example.com + comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Date not-a-date + comment: unknown field, no validation performed: WARC-Unknown-Field asdf WARC-Record-ID None WARC-Type invalid digest not present - error: duplicate field seen WARC-Date 2017-03-06T04:03:53.Z - error: fractional seconds must have 1-9 digits WARC-Date 2017-03-06T04:03:53.Z - error: duplicate field seen WARC-Date 2017-03-06T04:03:53.0Z - comment: unknown WARC-Type WARC-Type invalid + error: duplicate field seen: WARC-Date 2017-03-06T04:03:53.Z + error: fractional seconds must have 1-9 digits: WARC-Date 2017-03-06T04:03:53.Z + error: duplicate field seen: WARC-Date 2017-03-06T04:03:53.0Z + comment: unknown WARC-Type: WARC-Type invalid WARC-Record-ID None WARC-Type request digest not present - error: missing required header Content-Type - error: missing required header WARC-Date - error: missing required header WARC-Record-ID - error: missing required header WARC-Target-URI + error: missing required header: Content-Type + error: missing required header: WARC-Date + error: missing required header: WARC-Record-ID + error: missing required header: WARC-Target-URI recommendation: do not segment WARC-Type request - comment: no configuration seen for WARC-Segment-Number request + comment: Unknown field for this record type, perhaps an extension: request WARC-Segment-Number """ value = helper(args, 0) @@ -312,9 +312,9 @@ def test_digests(): WARC-Record-ID WARC-Type revisit digest present but not checked - recommendation: missing recommended header WARC-Refers-To - comment: field was introduced after this warc version WARC-Refers-To-Target-URI http://example.com/ 1.0 - comment: field was introduced after this warc version WARC-Refers-To-Date 2017-03-06T04:02:06Z 1.0 + recommendation: missing recommended header: WARC-Refers-To + comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI 
http://example.com/ + comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Date 2017-03-06T04:02:06Z WARC-Record-ID WARC-Type request digest not present @@ -330,14 +330,14 @@ def test_leftovers(): assert not commentary.has_comments() # hard to test because invalid WARC Content-Length raises in archiveiterator - warcio.tester.validate_content_length('content-length', 'not-an-integer', None, '1.0', commentary, None) + warcio.tester.validate_content_length('Content-Length', 'not-an-integer', None, '1.0', commentary, None) # hard to test because warcio checks the WARC version warcio.tester.validate_profile('blah', 'blah', None, '999', commentary, None) expected = '''\ -error: must be an integer content-length not-an-integer -comment: no profile check because unknown warc version blah blah +error: must be an integer: Content-Length not-an-integer +comment: no profile check because unknown warc version: blah blah ''' assert '\n'.join(commentary.comments())+'\n' == expected diff --git a/warcio/tester.py b/warcio/tester.py index 2300d062..4ee05f1f 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -59,7 +59,8 @@ def __getattr__(self, name): def canon_content_type(s): - return s.lower().replace('; ', ';') + # wget omits the space after the ;, let that pass + return s.lower().replace(';msgtype=', '; msgtype=') def validate_warc_fields(record, commentary): @@ -106,11 +107,11 @@ def validate_warc_fields(record, commentary): else: # check for field-name : if ':' not in line: - commentary.comment('Missing field-name : in warc-fields line:', line) + commentary.comment('Missing colon in warc-fields line:', line) else: field_name = line.split(':', 1)[0] if not re.search(token_re, field_name): - commentary.comment('invalid warc-fields name:', field_name) + commentary.comment('Invalid warc-fields name:', field_name) else: lines.append(line) first_line = False @@ -125,7 +126,7 @@ def validate_warc_fields(record, commentary): def validate_warcinfo(record, 
commentary, pending): content_type = record.rec_headers.get_header('Content-Type', 'none') if content_type.lower() != 'application/warc-fields': - commentary.recommendation('warcinfo Content-Type recommended to be application/warc-fields, saw', content_type) + commentary.recommendation('warcinfo Content-Type recommended to be application/warc-fields, saw:', content_type) else: # format: warc-fields # allowable fields include but not limited to DMCI plus the following @@ -145,8 +146,8 @@ def validate_response(record, commentary, pending): if target_uri.startswith('http:') or target_uri.startswith('https:'): content_type = record.rec_headers.get_header('Content-Type', 'none') - if canon_content_type(content_type) not in {'application/http;msgtype=response', 'application/http'}: - commentary.error('responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw', content_type) + if canon_content_type(content_type) not in {'application/http; msgtype=response', 'application/http'}: + commentary.error('responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw:', content_type) if record.rec_headers.get_header('WARC-IP-Address') is None: commentary.error('WARC-IP-Address should be used for http and https responses') @@ -163,7 +164,7 @@ def validate_resource(record, commentary, pending): if target_uri.startswith('dns:'): content_type = record.rec_headers.get_header('Content-Type', 'none') if content_type.lower() != 'text/dns': - commentary.error('recource records for dns: shall have Content-Type of text/dns, saw', content_type) + commentary.error('recource records for dns: shall have Content-Type of text/dns, saw:', content_type) else: # rfc 2540 and rfc 1035 #validate_text_dns() @@ -178,8 +179,8 @@ def validate_request(record, commentary, pending): if target_uri.startswith('http:') or target_uri.startswith('https:'): content_type = 
record.rec_headers.get_header('Content-Type') - if canon_content_type(content_type) not in {'application/http;msgtype=request', 'application/http'}: - commentary.error('requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw', content_type) + if canon_content_type(content_type) not in {'application/http; msgtype=request', 'application/http'}: + commentary.error('requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw:', content_type) if record.rec_headers.get_header('WARC-IP-Address') is None: commentary.error('WARC-IP-Address should be used for http and https requests') @@ -225,7 +226,7 @@ def validate_revisit(record, commentary, pending): # if yes, should be like a response record, truncated if desired # WARC-Refers-To-Date should be the same as WARC-Date in the original record if present else: - commentary.comment('no revisit details validation done due to unknown profile') + commentary.comment('no revisit details validation done due to unknown profile:', warc_profile) def validate_conversion(record, commentary, pending): @@ -239,7 +240,7 @@ def validate_continuation(record, commentary, pending): segment_number = record.rec_headers.get_header('WARC-Segment-Number', 'none') if segment_number.isdigit() and int(segment_number) < 2: - commentary.error('continuation record must have WARC-Segment-Number > 1, saw', segment_number) + commentary.error('continuation record must have WARC-Segment-Number > 1, saw:', segment_number) # last segment: required WARC-Segment-Total-Length, optional WARC-Truncated @@ -251,30 +252,30 @@ def validate_actual_uri(field, value, record, version, commentary, pending): # schemes are case-insensitive and normalize to lower if value.startswith('<') or value.endswith('>'): # wget 1.19 bug caused by WARC 1.0 spec error - commentary.error('uri must not be within <>', field, value) + commentary.error('uri must not be within <>:', field, 
value) if ':' not in value: - commentary.error('invalid uri, no scheme', field, value) + commentary.error('invalid uri, no scheme:', field, value) if re.search(r'\s', value): - commentary.error('invalid uri, contains whitespace', field, value) + commentary.error('invalid uri, contains whitespace:', field, value) scheme = value.split(':', 1)[0] if not re.search(r'\A[A-Za-z][A-Za-z0-9+\-\.]*\Z', scheme): - commentary.error('invalid uri scheme, bad character', field, value) + commentary.error('invalid uri scheme, bad character:', field, value) # https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml def validate_warc_type(field, value, record, version, commentary, pending): if not value.islower(): # I am unclear if this is allowed? standard is silent - commentary.comment('WARC-Type is not lower-case', field, value) + commentary.comment('WARC-Type is not lower-case:', field, value) if value.lower() not in record_types: # standard says readers should ignore unknown warc-types - commentary.comment('unknown WARC-Type', field, value) + commentary.comment('unknown WARC-Type:', field, value) def validate_uri(field, value, record, version, commentary, pending): # < uri > if not (value.startswith('<') and value.endswith('>')): - commentary.error('uri must be within <>', field, value) + commentary.error('uri must be within <>:', field, value) return validate_actual_uri(field, value[1:-1], record, version, commentary, pending) @@ -289,12 +290,12 @@ def validate_timestamp(field, value, record, version, commentary, pending): if not use_ms: if '.' in value: # XXX specification infelicity: would be nice to have 'advice to implementers' here - commentary.error('WARC 1.0 may not have fractional seconds', field, value) + commentary.error('WARC 1.0 time may not have fractional seconds:', field, value) else: if '.' 
in value: start, end = value.split('.', 1) if not re.search(r'\A[0-9]{1,9}Z\Z', end): - commentary.error('fractional seconds must have 1-9 digits', field, value) + commentary.error('fractional seconds must have 1-9 digits:', field, value) # XXX the above is pretty incomplete for dash, colon, trailing Z, etc @@ -304,7 +305,7 @@ def validate_timestamp(field, value, record, version, commentary, pending): def validate_content_length(field, value, record, version, commentary, pending): if not value.isdigit(): - commentary.error('must be an integer', field, value) + commentary.error('must be an integer:', field, value) token_re = r'\A[!"#$%&\'()*+\-\.0-9A-Z\^_`a-z|~]+\Z' @@ -313,7 +314,7 @@ def validate_content_length(field, value, record, version, commentary, pending): def validate_content_type(field, value, record, version, commentary, pending): if '/' not in value: - commentary.error('must contain a /', field, value) + commentary.error('must contain a /:', field, value) splits = value.split('/', 1) ctype = splits[0] if len(splits) > 1: @@ -321,13 +322,13 @@ def validate_content_type(field, value, record, version, commentary, pending): else: rest = '' if not re.search(token_re, ctype): - commentary.error('invalid type', field, value) + commentary.error('invalid type:', field, value) if ';' in rest: subtype, rest = rest.split(';', 1) else: subtype = rest if not re.search(token_re, subtype): - commentary.error('invalid subtype', field, value) + commentary.error('invalid subtype:', field, value) # at this point there can be multiple parameters, # some of which could have quoted string values with ; in them @@ -337,7 +338,7 @@ def validate_content_type(field, value, record, version, commentary, pending): def validate_digest(field, value, record, version, commentary, pending): if ':' not in value: - commentary.error('missing algorithm', field, value) + commentary.error('missing algorithm:', field, value) splits = value.split(':', 1) algorithm = splits[0] if len(splits) > 1: 
@@ -345,18 +346,19 @@ def validate_digest(field, value, record, version, commentary, pending): else: digest = 'none' if not re.search(token_re, algorithm): - commentary.error('invalid algorithm', field, value) + commentary.error('invalid algorithm:', field, value) else: try: Digester(algorithm) except ValueError: - commentary.comment('unknown digest algorithm', field, value) + commentary.comment('unknown digest algorithm:', field, value) if not re.search(token_re, digest): # https://github.com/iipc/warc-specifications/issues/48 # commentary.comment('spec incorrectly says this is an invalid digest', field, value) pass if not re.search(digest_re, digest): - commentary.comment('Invalid-looking digest value', field, value) + # suggested in https://github.com/iipc/warc-specifications/issues/48 + commentary.comment('Invalid-looking digest value:', field, value) def validate_ip(field, value, record, version, commentary, pending): @@ -366,14 +368,14 @@ def validate_ip(field, value, record, version, commentary, pending): value = unicode(value) ipaddress.ip_address(value) except ValueError: - commentary.error('invalid ip', field, value) + commentary.error('invalid ip:', field, value) except (ImportError, NameError): # pragma: no cover commentary.comment('did not check ip address format, install ipaddress module from pypi if you care') def validate_truncated(field, value, record, version, commentary, pending): if value.lower() not in {'length', 'time', 'disconnect', 'unspecified'}: - commentary.comment('extension seen', field, value) + commentary.comment('extension seen:', field, value) def validate_warcinfo_id(field, value, record, version, commentary, pending): @@ -400,31 +402,31 @@ def validate_filename(field, value, record, version, commentary, pending): def validate_profile(field, value, record, version, commentary, pending): if version not in profiles: - commentary.comment('no profile check because unknown warc version', field, value) + commentary.comment('no profile 
check because unknown warc version:', field, value) return if value not in profiles[version]: - commentary.comment('extension seen', field, value) + commentary.comment('extension seen:', field, value) def validate_segment_number(field, value, record, version, commentary, pending): if not value.isdigit(): - commentary.error('must be an integer', field, value) + commentary.error('must be an integer:', field, value) return iv = int(value) if iv == 0: - commentary.error('must be 1 or greater', field, value) + commentary.error('must be 1 or greater:', field, value) rec_type = record.rec_headers.get_header('WARC-Type', 'none') if rec_type != 'continuation': if iv != 1: - commentary.error('non-continuation records must always have WARC-Segment-Number = 1', field, value) + commentary.error('non-continuation records must always have WARC-Segment-Number: 1:', field, value) if rec_type in {'warcinfo', 'request', 'metadata', 'revisit'}: commentary.recommendation('do not segment WARC-Type', rec_type) def validate_segment_total_length(field, value, record, version, commentary, pending): if not value.isdigit(): - commentary.error('must be an integer', field, value) + commentary.error('must be an integer:', field, value) warc_fields = { @@ -568,21 +570,21 @@ def make_header_set(config, kinds): def validate_fields_against_rec_type(rec_type, config, rec_headers, commentary, allow_all=False): for req in sorted(config.get('required', [])): if not rec_headers.get_header(req): - commentary.error('missing required header', req) + commentary.error('missing required header:', req) for rec in sorted(config.get('recommended', [])): if not rec_headers.get_header(rec): - commentary.recommendation('missing recommended header', rec) + commentary.recommendation('missing recommended header:', rec) allowed = make_header_set(config, ('required', 'optional', 'recommended')) prohibited = make_header_set(config, ('prohibited',)) for field, value in rec_headers.headers: fl = field.lower() if fl in 
prohibited: - commentary.error('field not allowed in record_type', field, rec_type) + commentary.error('field not allowed in record type:', rec_type, field) elif allow_all or fl in allowed: pass elif fl in warc_fields: - commentary.comment('no configuration seen for', field, rec_type) + commentary.comment('Unknown field for this record type, perhaps an extension:', rec_type, field) else: # an 'unknown field' comment has already been issued in validate_record pass @@ -605,16 +607,16 @@ def validate_record(record): for field, value in record.rec_headers.headers: field_l = field.lower() if field != 'warc-concurrent-to' and field_l in seen_fields: - commentary.error('duplicate field seen', field, value) + commentary.error('duplicate field seen:', field, value) seen_fields.add(field_l) if field_l not in warc_fields: - commentary.comment('unknown field, no validation performed', field, value) + commentary.comment('unknown field, no validation performed:', field, value) continue config = warc_fields[field_l] if 'minver' in config: if version < config['minver']: # unknown fields are extensions, so this is a comment and not an error - commentary.comment('field was introduced after this warc version', field, value, version) + commentary.comment('field was introduced after this warc version:', version, field, value) if 'validate' in config: config['validate'](field, value, record, version, commentary, pending) From 46874975664acf2ad8615511212df4edecb78d4b Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Mon, 28 Jan 2019 13:15:30 -0800 Subject: [PATCH 52/68] use valid record ids --- .../data/standard-torture-validate-field.warc | 106 +++++++++--------- .../standard-torture-validate-record.warc | 32 +++--- test/test_tests.py | 62 ++++------ warcio/tester.py | 30 ++--- 4 files changed, 107 insertions(+), 123 deletions(-) diff --git a/test/data/standard-torture-validate-field.warc b/test/data/standard-torture-validate-field.warc index c88d3ee6..816413be 100644 --- 
a/test/data/standard-torture-validate-field.warc +++ b/test/data/standard-torture-validate-field.warc @@ -1,53 +1,53 @@ -WARC/1.0 -WARC-Target-URI: -WARC-Target-URI: example.com -WARC-Target-URI: ex ample.com -WARC-Target-URI: h<>ttp://example.com/ -WARC-Type: does-not-exist -WARC-Type: CAPITALIZED -WARC-Concurrent-To: http://example.com/ -WARC-Record-ID: -WARC-Date: 2017-03-06T04:03:53Z -WARC-Date: 2017-03-06T04:03:53.Z -Content-Type: asdf -Content-Type: has space/asdf -Content-Type: asdf/has space -Content-Type: asdf/has space;asdf -WARC-Block-Digest: asdf -WARC-Block-Digest: has space:asdf -WARC-Block-Digest: sha1:&$*^&*^#*&^ -WARC-IP-Address: 1.2.3.4.5 -WARC-Truncated: invalid -WARC-Warcinfo-ID: asdf:asdf -WARC-Filename: not-yet-tested -WARC-Profile: asdf -WARC-Profile: http://netpreserve.org/warc/1.0/revisit/identical-payload-digest -WARC-Identified-Payload-Type: asdf -WARC-Segment-Origin-ID: http://example.com -WARC-Segment-Number: not-an-integer -WARC-Segment-Number: 0 -WARC-Segment-Number: 1 -WARC-Segment-Number: 2 -WARC-Segment-Total-Length: 0 -WARC-Segment-Total-Length: not-an-integer -WARC-Refers-To-Target-URI: http://example.com -WARC-Refers-To-Date: not-a-date -WARC-Unknown-Field: asdf -Content-Length: 0 - - -WARC/1.1 -WARC-Date: 2017-03-06T04:03:53Z -WARC-Date: 2017-03-06T04:03:53.Z -WARC-Date: 2017-03-06T04:03:53.0Z -WARC-Type: invalid -Content-Length: 0 - - -WARC/1.1 -WARC-Type: request -WARC-Segment-Number: 1 -Content-Length: 0 - - -WARC/invalid +WARC/1.0 +WARC-Target-URI: +WARC-Target-URI: example.com +WARC-Target-URI: ex ample.com +WARC-Target-URI: h<>ttp://example.com/ +WARC-Type: does-not-exist +WARC-Type: CAPITALIZED +WARC-Concurrent-To: http://example.com/ +WARC-Record-ID: +WARC-Date: 2017-03-06T04:03:53Z +WARC-Date: 2017-03-06T04:03:53.Z +Content-Type: asdf +Content-Type: has space/asdf +Content-Type: asdf/has space +Content-Type: asdf/has space;asdf +WARC-Block-Digest: asdf +WARC-Block-Digest: has space:asdf +WARC-Block-Digest: 
sha1:&$*^&*^#*&^ +WARC-IP-Address: 1.2.3.4.5 +WARC-Truncated: invalid +WARC-Warcinfo-ID: asdf:asdf +WARC-Filename: not-yet-tested +WARC-Profile: asdf +WARC-Profile: http://netpreserve.org/warc/1.0/revisit/identical-payload-digest +WARC-Identified-Payload-Type: asdf +WARC-Segment-Origin-ID: http://example.com +WARC-Segment-Number: not-an-integer +WARC-Segment-Number: 0 +WARC-Segment-Number: 1 +WARC-Segment-Number: 2 +WARC-Segment-Total-Length: 0 +WARC-Segment-Total-Length: not-an-integer +WARC-Refers-To-Target-URI: http://example.com +WARC-Refers-To-Date: not-a-date +WARC-Unknown-Field: asdf +Content-Length: 0 + + +WARC/1.1 +WARC-Date: 2017-03-06T04:03:53Z +WARC-Date: 2017-03-06T04:03:53.Z +WARC-Date: 2017-03-06T04:03:53.0Z +WARC-Type: invalid +Content-Length: 0 + + +WARC/1.1 +WARC-Type: request +WARC-Segment-Number: 1 +Content-Length: 0 + + +WARC/invalid diff --git a/test/data/standard-torture-validate-record.warc b/test/data/standard-torture-validate-record.warc index fa03b38e..da6a2aaf 100644 --- a/test/data/standard-torture-validate-record.warc +++ b/test/data/standard-torture-validate-record.warc @@ -15,7 +15,7 @@ token cannot have a space: WARC/1.0 -WARC-Record-ID: test-empty-warc-fields +WARC-Record-ID: WARC-Type: warcinfo Content-Type: application/warc-fields Content-Length: 0 @@ -23,7 +23,7 @@ Content-Length: 0 WARC/1.0 WARC-Type: warcinfo -WARC-Record-ID: test-warcinfo-non-recommended-content-type +WARC-Record-ID: Content-Type: not-application/warc-fields Content-Length: 5 @@ -32,7 +32,7 @@ foo WARC/1.0 WARC-Type: response -WARC-Record-ID: test-response-content-type +WARC-Record-ID: WARC-Target-URI: HtTp://example.com/ Content-Type: text/plain Content-Length: 0 @@ -40,7 +40,7 @@ Content-Length: 0 WARC/1.0 WARC-Type: resource -WARC-Record-ID: test-resource-dns-content-type +WARC-Record-ID: WARC-Target-URI: DnS:asdfasdf Content-Type: text/plain Content-Length: 0 @@ -48,7 +48,7 @@ Content-Length: 0 WARC/1.0 WARC-Type: resource -WARC-Record-ID: 
test-resource-dns-empty +WARC-Record-ID: WARC-Test-TODO: add another with valid block WARC-Target-URI: DnS:asdfasdf Content-Type: text/dns @@ -57,14 +57,14 @@ Content-Length: 0 WARC/1.0 WARC-Type: resource -WARC-Record-ID: test-resource-not-dns +WARC-Record-ID: WARC-Target-URI: foo:bar Content-Length: 0 WARC/1.0 WARC-Type: request -WARC-Record-ID: test-request-unrecommended-content-type +WARC-Record-ID: WARC-Target-URI: hTtP://example.com/ Content-Type: text/plain Content-Length: 0 @@ -72,7 +72,7 @@ Content-Length: 0 WARC/1.0 WARC-Type: request -WARC-Record-ID: test-request-unrecommended-content-type-with-ip +WARC-Record-ID: WARC-Target-URI: hTtP://example.com/ WARC-IP-Address: 1.2.3.4 Content-Type: text/plain @@ -81,55 +81,55 @@ Content-Length: 0 WARC/1.0 WARC-Type: metadata -WARC-Record-ID: test-metadata-warc-fields-empty +WARC-Record-ID: Content-Type: application/warc-fields Content-Length: 0 WARC/1.0 WARC-Type: metadata -WARC-Record-ID: test-metadata-not-warc-fields +WARC-Record-ID: Content-Type: not-application/warc-fields Content-Length: 0 WARC/1.0 WARC-Type: revisit -WARC-Record-ID: test-revisit-profile-unknown +WARC-Record-ID: WARC-Profile: none Content-Length: 0 WARC/1.0 WARC-Type: revisit -WARC-Record-ID: test-revisit-profile-future +WARC-Record-ID: WARC-Profile: http://netpreserve.org/warc/1.1/revisit/identical-payload-digest Content-Length: 0 WARC/1.0 WARC-Type: revisit -WARC-Record-ID: test-revisit-profile-good +WARC-Record-ID: WARC-Profile: http://netpreserve.org/warc/1.0/revisit/server-not-modified Content-Length: 0 WARC/1.0 WARC-Type: conversion -WARC-Record-ID: test-conversion +WARC-Record-ID: Content-Length: 0 WARC/1.0 WARC-Type: continuation -WARC-Record-ID: test-continuation-segment-1 +WARC-Record-ID: WARC-Segment-Number: 1 Content-Length: 0 WARC/1.0 WARC-Type: continuation -WARC-Record-ID: test-continuation-segment-valid +WARC-Record-ID: WARC-Segment-Number: 2 Content-Length: 0 diff --git a/test/test_tests.py b/test/test_tests.py index 
91eba656..c08a19f6 100644 --- a/test/test_tests.py +++ b/test/test_tests.py @@ -51,80 +51,68 @@ def test_torture_validate_record(): comment: warc-fields lines must end with \\r\\n: test: lines should end with \\r\\n comment: Missing colon in warc-fields line: no colon comment: Invalid warc-fields name: token cannot have a space - WARC-Record-ID test-empty-warc-fields + WARC-Record-ID WARC-Type warcinfo digest not present - error: uri must be within <>: WARC-Record-ID test-empty-warc-fields error: missing required header: WARC-Date comment: warc-fields body present but empty - WARC-Record-ID test-warcinfo-non-recommended-content-type + WARC-Record-ID WARC-Type warcinfo digest not present - error: uri must be within <>: WARC-Record-ID test-warcinfo-non-recommended-content-type error: missing required header: WARC-Date - recommendation: warcinfo Content-Type recommended to be application/warc-fields, saw: not-application/warc-fields - WARC-Record-ID test-response-content-type + recommendation: warcinfo Content-Type recommended to be application/warc-fields: not-application/warc-fields + WARC-Record-ID WARC-Type response digest not present - error: uri must be within <>: WARC-Record-ID test-response-content-type error: missing required header: WARC-Date - error: responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw: text/plain + error: responses for http/https should have Content-Type of application/http; msgtype=response or application/http: text/plain error: WARC-IP-Address should be used for http and https responses - WARC-Record-ID test-resource-dns-content-type + WARC-Record-ID WARC-Type resource digest not present - error: uri must be within <>: WARC-Record-ID test-resource-dns-content-type error: missing required header: WARC-Date - error: recource records for dns: shall have Content-Type of text/dns, saw: text/plain - WARC-Record-ID test-resource-dns-empty + error: resource records for dns shall have 
Content-Type of text/dns: text/plain + WARC-Record-ID WARC-Type resource digest not present - error: uri must be within <>: WARC-Record-ID test-resource-dns-empty error: missing required header: WARC-Date comment: unknown field, no validation performed: WARC-Test-TODO add another with valid block - WARC-Record-ID test-resource-not-dns + WARC-Record-ID WARC-Type resource digest not present - error: uri must be within <>: WARC-Record-ID test-resource-not-dns error: missing required header: Content-Type error: missing required header: WARC-Date - WARC-Record-ID test-request-unrecommended-content-type + WARC-Record-ID WARC-Type request digest not present - error: uri must be within <>: WARC-Record-ID test-request-unrecommended-content-type error: missing required header: WARC-Date - error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw: text/plain + error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http: text/plain error: WARC-IP-Address should be used for http and https requests - WARC-Record-ID test-request-unrecommended-content-type-with-ip + WARC-Record-ID WARC-Type request digest not present - error: uri must be within <>: WARC-Record-ID test-request-unrecommended-content-type-with-ip error: missing required header: WARC-Date - error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw: text/plain - WARC-Record-ID test-metadata-warc-fields-empty + error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http: text/plain + WARC-Record-ID WARC-Type metadata digest not present - error: uri must be within <>: WARC-Record-ID test-metadata-warc-fields-empty error: missing required header: WARC-Date comment: warc-fields body present but empty - WARC-Record-ID test-metadata-not-warc-fields + WARC-Record-ID WARC-Type metadata digest not present 
- error: uri must be within <>: WARC-Record-ID test-metadata-not-warc-fields error: missing required header: WARC-Date - WARC-Record-ID test-revisit-profile-unknown + WARC-Record-ID WARC-Type revisit digest not present - error: uri must be within <>: WARC-Record-ID test-revisit-profile-unknown error: missing required header: Content-Type error: missing required header: WARC-Date error: missing required header: WARC-Target-URI comment: extension seen: WARC-Profile none comment: no revisit details validation done due to unknown profile: none - WARC-Record-ID test-revisit-profile-future + WARC-Record-ID WARC-Type revisit digest not present - error: uri must be within <>: WARC-Record-ID test-revisit-profile-future error: missing required header: Content-Type error: missing required header: WARC-Date error: missing required header: WARC-Target-URI @@ -133,34 +121,30 @@ def test_torture_validate_record(): recommendation: missing recommended header: WARC-Refers-To-Date recommendation: missing recommended header: WARC-Refers-To-Target-URI comment: extension seen: WARC-Profile http://netpreserve.org/warc/1.1/revisit/identical-payload-digest - WARC-Record-ID test-revisit-profile-good + WARC-Record-ID WARC-Type revisit digest not present - error: uri must be within <>: WARC-Record-ID test-revisit-profile-good error: missing required header: Content-Type error: missing required header: WARC-Date error: missing required header: WARC-Target-URI recommendation: missing recommended header: WARC-Refers-To recommendation: missing recommended header: WARC-Refers-To-Date - WARC-Record-ID test-conversion + WARC-Record-ID WARC-Type conversion digest not present - error: uri must be within <>: WARC-Record-ID test-conversion error: missing required header: WARC-Date error: missing required header: WARC-Target-URI - WARC-Record-ID test-continuation-segment-1 + WARC-Record-ID WARC-Type continuation digest not present - error: uri must be within <>: WARC-Record-ID test-continuation-segment-1 
error: missing required header: WARC-Date error: missing required header: WARC-Segment-Origin-ID error: missing required header: WARC-Target-URI - error: continuation record must have WARC-Segment-Number > 1, saw: 1 + error: continuation record must have WARC-Segment-Number > 1: 1 comment: warcio test continuation code has not been tested, expect bugs - WARC-Record-ID test-continuation-segment-valid + WARC-Record-ID WARC-Type continuation digest not present - error: uri must be within <>: WARC-Record-ID test-continuation-segment-valid error: missing required header: WARC-Date error: missing required header: WARC-Segment-Origin-ID error: missing required header: WARC-Target-URI @@ -184,7 +168,7 @@ def test_torture_validate_field(): expected = """\ test/data/standard-torture-validate-field.warc - WARC-Record-ID + WARC-Record-ID WARC-Type does-not-exist unknown hash algorithm name in block digest error: uri must not be within <>: WARC-Target-URI diff --git a/warcio/tester.py b/warcio/tester.py index 4ee05f1f..023cdb29 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -126,7 +126,7 @@ def validate_warc_fields(record, commentary): def validate_warcinfo(record, commentary, pending): content_type = record.rec_headers.get_header('Content-Type', 'none') if content_type.lower() != 'application/warc-fields': - commentary.recommendation('warcinfo Content-Type recommended to be application/warc-fields, saw:', content_type) + commentary.recommendation('warcinfo Content-Type recommended to be application/warc-fields:', content_type) else: # format: warc-fields # allowable fields include but not limited to DMCI plus the following @@ -147,7 +147,7 @@ def validate_response(record, commentary, pending): if target_uri.startswith('http:') or target_uri.startswith('https:'): content_type = record.rec_headers.get_header('Content-Type', 'none') if canon_content_type(content_type) not in {'application/http; msgtype=response', 'application/http'}: - commentary.error('responses for 
http/https should have Content-Type of application/http; msgtype=response or application/http, saw:', content_type) + commentary.error('responses for http/https should have Content-Type of application/http; msgtype=response or application/http:', content_type) if record.rec_headers.get_header('WARC-IP-Address') is None: commentary.error('WARC-IP-Address should be used for http and https responses') @@ -164,7 +164,7 @@ def validate_resource(record, commentary, pending): if target_uri.startswith('dns:'): content_type = record.rec_headers.get_header('Content-Type', 'none') if content_type.lower() != 'text/dns': - commentary.error('recource records for dns: shall have Content-Type of text/dns, saw:', content_type) + commentary.error('resource records for dns shall have Content-Type of text/dns:', content_type) else: # rfc 2540 and rfc 1035 #validate_text_dns() @@ -180,7 +180,7 @@ def validate_request(record, commentary, pending): content_type = record.rec_headers.get_header('Content-Type') if canon_content_type(content_type) not in {'application/http; msgtype=request', 'application/http'}: - commentary.error('requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw:', content_type) + commentary.error('requests for http/https should have Content-Type of application/http; msgtype=request or application/http:', content_type) if record.rec_headers.get_header('WARC-IP-Address') is None: commentary.error('WARC-IP-Address should be used for http and https requests') @@ -240,12 +240,12 @@ def validate_continuation(record, commentary, pending): segment_number = record.rec_headers.get_header('WARC-Segment-Number', 'none') if segment_number.isdigit() and int(segment_number) < 2: - commentary.error('continuation record must have WARC-Segment-Number > 1, saw:', segment_number) + commentary.error('continuation record must have WARC-Segment-Number > 1:', segment_number) # last segment: required WARC-Segment-Total-Length, optional 
WARC-Truncated -def validate_actual_uri(field, value, record, version, commentary, pending): +def validate_unbracketed_uri(field, value, record, version, commentary, pending): # uri per RFC 3986 # should use a registered scheme # %XX encoding, normalize to upper case @@ -272,16 +272,16 @@ def validate_warc_type(field, value, record, version, commentary, pending): commentary.comment('unknown WARC-Type:', field, value) -def validate_uri(field, value, record, version, commentary, pending): +def validate_bracketed_uri(field, value, record, version, commentary, pending): # < uri > if not (value.startswith('<') and value.endswith('>')): commentary.error('uri must be within <>:', field, value) return - validate_actual_uri(field, value[1:-1], record, version, commentary, pending) + validate_unbracketed_uri(field, value[1:-1], record, version, commentary, pending) def validate_record_id(field, value, record, version, commentary, pending): - validate_uri(field, value, record, version, commentary, pending) + validate_bracketed_uri(field, value, record, version, commentary, pending) # TODO: should be "globally unique for its period of intended use" @@ -379,7 +379,7 @@ def validate_truncated(field, value, record, version, commentary, pending): def validate_warcinfo_id(field, value, record, version, commentary, pending): - validate_uri(field, value, record, version, commentary, pending) + validate_bracketed_uri(field, value, record, version, commentary, pending) # TODO: should point at a warcinfo record @@ -446,7 +446,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen 'validate': validate_content_type, }, 'WARC-Concurrent-To': { - 'validate': validate_uri, + 'validate': validate_bracketed_uri, }, 'WARC-Block-Digest': { 'validate': validate_digest, @@ -458,10 +458,10 @@ def validate_segment_total_length(field, value, record, version, commentary, pen 'validate': validate_ip, }, 'WARC-Refers-To': { - 'validate': validate_uri, + 'validate': 
validate_bracketed_uri, }, 'WARC-Target-URI': { - 'validate': validate_actual_uri, + 'validate': validate_unbracketed_uri, }, 'WARC-Truncated': { 'validate': validate_truncated, @@ -479,7 +479,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen 'validate': validate_content_type, }, 'WARC-Segment-Origin-ID': { - 'validate': validate_uri, + 'validate': validate_bracketed_uri, }, 'WARC-Segment-Number': { 'validate': validate_segment_number, @@ -488,7 +488,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen 'validate': validate_segment_total_length, }, 'WARC-Refers-To-Target-URI': { - 'validate': validate_actual_uri, + 'validate': validate_unbracketed_uri, 'minver': '1.1', }, 'WARC-Refers-To-Date': { From bcfe672f26506bcf9b070834c2fb6842f5d1028c Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Mon, 28 Jan 2019 13:31:40 -0800 Subject: [PATCH 53/68] warc-segment-number cleaner recommendation --- test/test_tests.py | 1 - warcio/tester.py | 12 ++++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/test/test_tests.py b/test/test_tests.py index c08a19f6..dcbc3666 100644 --- a/test/test_tests.py +++ b/test/test_tests.py @@ -237,7 +237,6 @@ def test_torture_validate_field(): error: missing required header: WARC-Record-ID error: missing required header: WARC-Target-URI recommendation: do not segment WARC-Type request - comment: Unknown field for this record type, perhaps an extension: request WARC-Segment-Number """ value = helper(args, 0) diff --git a/warcio/tester.py b/warcio/tester.py index 023cdb29..6346754d 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -503,20 +503,21 @@ def validate_segment_total_length(field, value, record, version, commentary, pen 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'Content-Type'], 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Filename', 'WARC-Truncated'], 'prohibited': ['WARC-Refers-To', 
'WARC-Profile', 'WARC-Identified-Payload-Type'], + 'ignored': ['WARC-Segment-Number'], 'validate': validate_warcinfo, }, 'response': { 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'Content-Type', 'WARC-Target-URI'], 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated', - 'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address'], + 'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Segment-Number'], 'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], 'validate': validate_response, }, 'resource': { 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'WARC-Target-URI', 'Content-Type'], 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated', - 'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-Identified-Payload-Type'], + 'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-Identified-Payload-Type', 'WARC-Segment-Number'], 'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], 'validate': validate_resource, }, @@ -526,6 +527,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated', 'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address'], 'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], + 'ignored': ['WARC-Segment-Number'], 'validate': validate_request, }, 'metadata': { @@ -534,6 +536,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen 'optional': ['WARC-Block-Digest', 'WARC-IP-Address', 'WARC-Truncated', 'WARC-Concurrent-To', 'WARC-Refers-To', 'WARC-Target-URI', 'WARC-Warcinfo-ID'], 'prohibited': ['WARC-Payload-Digest', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 
'WARC-Filename', 'WARC-Profile'], + 'ignored': ['WARC-Segment-Number'], 'validate': validate_metadata, }, 'revisit': { @@ -542,11 +545,12 @@ def validate_segment_total_length(field, value, record, version, commentary, pen 'optional': ['WARC-Block-Digest', 'WARC-Truncated', 'WARC-IP-Address', 'WARC-Warcinfo-ID', # normal optionals 'WARC-Payload-Digest', 'WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date'], # these are for profiles 'prohibited': ['WARC-Filename'], + 'ignored': ['WARC-Segment-Number'], 'validate': validate_revisit, }, 'conversion': { 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'WARC-Target-URI'], - 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Truncated', 'WARC-Refers-To', 'WARC-Warcinfo-ID'], + 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Truncated', 'WARC-Refers-To', 'WARC-Warcinfo-ID', 'WARC-Segment-Number'], 'prohibited': ['WARC-Concurrent-To', 'WARC-IP-Address', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], 'validate': validate_conversion, }, @@ -574,7 +578,7 @@ def validate_fields_against_rec_type(rec_type, config, rec_headers, commentary, for rec in sorted(config.get('recommended', [])): if not rec_headers.get_header(rec): commentary.recommendation('missing recommended header:', rec) - allowed = make_header_set(config, ('required', 'optional', 'recommended')) + allowed = make_header_set(config, ('required', 'optional', 'recommended', 'ignored')) prohibited = make_header_set(config, ('prohibited',)) for field, value in rec_headers.headers: From 7f715c055c25f2a8486555196dc683ba59a0220d Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Mon, 28 Jan 2019 13:55:54 -0800 Subject: [PATCH 54/68] segment origin id --- test/test_tests.py | 1 + warcio/tester.py | 23 +++++++++++++---------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/test/test_tests.py b/test/test_tests.py index dcbc3666..598ba49b 100644 --- 
a/test/test_tests.py +++ b/test/test_tests.py @@ -232,6 +232,7 @@ def test_torture_validate_field(): WARC-Record-ID None WARC-Type request digest not present + error: segmented records must have both WARC-Segment-Number and WARC-Segment-Origin-ID error: missing required header: Content-Type error: missing required header: WARC-Date error: missing required header: WARC-Record-ID diff --git a/warcio/tester.py b/warcio/tester.py index 6346754d..632de060 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -420,6 +420,9 @@ def validate_segment_number(field, value, record, version, commentary, pending): if rec_type != 'continuation': if iv != 1: commentary.error('non-continuation records must always have WARC-Segment-Number: 1:', field, value) + origin_id = record.rec_headers.get_header('WARC-Segment-Origin-ID') + if origin_id is None: + commentary.error('segmented records must have both WARC-Segment-Number and WARC-Segment-Origin-ID') if rec_type in {'warcinfo', 'request', 'metadata', 'revisit'}: commentary.recommendation('do not segment WARC-Type', rec_type) @@ -503,21 +506,21 @@ def validate_segment_total_length(field, value, record, version, commentary, pen 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'Content-Type'], 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Filename', 'WARC-Truncated'], 'prohibited': ['WARC-Refers-To', 'WARC-Profile', 'WARC-Identified-Payload-Type'], - 'ignored': ['WARC-Segment-Number'], + 'ignored': ['WARC-Segment-Number', 'WARC-Segment-Origin-ID'], 'validate': validate_warcinfo, }, 'response': { 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'Content-Type', 'WARC-Target-URI'], 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated', - 'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Segment-Number'], + 'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Segment-Number', 'WARC-Segment-Origin-ID'], 
'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], 'validate': validate_response, }, 'resource': { 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'WARC-Target-URI', 'Content-Type'], 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated', - 'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-Identified-Payload-Type', 'WARC-Segment-Number'], + 'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-Identified-Payload-Type', 'WARC-Segment-Number', 'WARC-Segment-Origin-ID'], 'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], 'validate': validate_resource, }, @@ -527,7 +530,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated', 'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address'], 'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], - 'ignored': ['WARC-Segment-Number'], + 'ignored': ['WARC-Segment-Number', 'WARC-Segment-Origin-ID'], 'validate': validate_request, }, 'metadata': { @@ -536,7 +539,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen 'optional': ['WARC-Block-Digest', 'WARC-IP-Address', 'WARC-Truncated', 'WARC-Concurrent-To', 'WARC-Refers-To', 'WARC-Target-URI', 'WARC-Warcinfo-ID'], 'prohibited': ['WARC-Payload-Digest', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], - 'ignored': ['WARC-Segment-Number'], + 'ignored': ['WARC-Segment-Number', 'WARC-Segment-Origin-ID'], 'validate': validate_metadata, }, 'revisit': { @@ -545,18 +548,18 @@ def validate_segment_total_length(field, value, record, version, commentary, pen 'optional': ['WARC-Block-Digest', 'WARC-Truncated', 'WARC-IP-Address', 'WARC-Warcinfo-ID', # 
normal optionals 'WARC-Payload-Digest', 'WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date'], # these are for profiles 'prohibited': ['WARC-Filename'], - 'ignored': ['WARC-Segment-Number'], + 'ignored': ['WARC-Segment-Number', 'WARC-Segment-Origin-ID'], 'validate': validate_revisit, }, 'conversion': { 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'WARC-Target-URI'], - 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Truncated', 'WARC-Refers-To', 'WARC-Warcinfo-ID', 'WARC-Segment-Number'], + 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Truncated', 'WARC-Refers-To', 'WARC-Warcinfo-ID', 'WARC-Segment-Number', 'WARC-Segment-Origin-ID'], 'prohibited': ['WARC-Concurrent-To', 'WARC-IP-Address', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], 'validate': validate_conversion, }, 'continuation': { 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', - 'WARC-Segment-Origin-ID', 'WARC-Segment-Number', 'WARC-Target-URI'], + 'WARC-Segment-Number', 'WARC-Segment-Origin-ID', 'WARC-Target-URI'], 'optional': ['WARC-Segment-Total-Length', 'WARC-Truncated'], 'prohibited': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], 'validate': validate_continuation, @@ -587,8 +590,8 @@ def validate_fields_against_rec_type(rec_type, config, rec_headers, commentary, commentary.error('field not allowed in record type:', rec_type, field) elif allow_all or fl in allowed: pass - elif fl in warc_fields: - commentary.comment('Unknown field for this record type, perhaps an extension:', rec_type, field) + elif fl in warc_fields: # pragma: no cover (this is a configuration error, if it happens) + commentary.comment('Known field, but not expected for this record type:', rec_type, field) else: # an 'unknown field' comment has already 
been issued in validate_record pass From 2583f19c762037a66847e2a4087c16082a06fbfc Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Mon, 28 Jan 2019 14:51:19 -0800 Subject: [PATCH 55/68] timestamp checking --- test/test_tests.py | 6 ++++-- warcio/tester.py | 21 ++++++++------------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/test/test_tests.py b/test/test_tests.py index 598ba49b..89851eca 100644 --- a/test/test_tests.py +++ b/test/test_tests.py @@ -184,7 +184,8 @@ def test_torture_validate_field(): error: duplicate field seen: WARC-Type CAPITALIZED error: uri must be within <>: WARC-Concurrent-To http://example.com/ error: duplicate field seen: WARC-Date 2017-03-06T04:03:53.Z - error: WARC 1.0 time may not have fractional seconds: WARC-Date 2017-03-06T04:03:53.Z + error: Invalid timestamp: WARC-Date 2017-03-06T04:03:53.Z + error: WARC versions <= 1.0 may not have timestamps with fractional seconds: WARC-Date 2017-03-06T04:03:53.Z error: must contain a /: Content-Type asdf error: invalid subtype: Content-Type asdf error: duplicate field seen: Content-Type has space/asdf @@ -212,6 +213,7 @@ def test_torture_validate_field(): error: non-continuation records must always have WARC-Segment-Number: 1: WARC-Segment-Number 2 error: duplicate field seen: WARC-Segment-Total-Length not-an-integer error: must be an integer: WARC-Segment-Total-Length not-an-integer + error: Invalid timestamp: WARC-Refers-To-Date not-a-date comment: unknown WARC-Type: WARC-Type does-not-exist comment: WARC-Type is not lower-case: WARC-Type CAPITALIZED comment: unknown WARC-Type: WARC-Type CAPITALIZED @@ -226,7 +228,7 @@ def test_torture_validate_field(): WARC-Type invalid digest not present error: duplicate field seen: WARC-Date 2017-03-06T04:03:53.Z - error: fractional seconds must have 1-9 digits: WARC-Date 2017-03-06T04:03:53.Z + error: Invalid timestamp: WARC-Date 2017-03-06T04:03:53.Z error: duplicate field seen: WARC-Date 2017-03-06T04:03:53.0Z comment: unknown 
WARC-Type: WARC-Type invalid WARC-Record-ID None diff --git a/warcio/tester.py b/warcio/tester.py index 632de060..5396ff3b 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -286,21 +286,16 @@ def validate_record_id(field, value, record, version, commentary, pending): def validate_timestamp(field, value, record, version, commentary, pending): - use_ms = False if version == '1.0' else True - if not use_ms: - if '.' in value: - # XXX specification infelicity: would be nice to have 'advice to implementers' here - commentary.error('WARC 1.0 time may not have fractional seconds:', field, value) - else: - if '.' in value: - start, end = value.split('.', 1) - if not re.search(r'\A[0-9]{1,9}Z\Z', end): - commentary.error('fractional seconds must have 1-9 digits:', field, value) + ISO_RE = r'\A\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:.\d{1,9})?Z\Z' - # XXX the above is pretty incomplete for dash, colon, trailing Z, etc + if not re.match(ISO_RE, value): + commentary.error('Invalid timestamp:', field, value) - # TODO: "multiple records written as part of a single capture event shall use the same WARC-Date" - # how? follow WARC-Concurrent-To pointer(s) from request to response(s) + use_ms = False if version <= '1.0' else True + if not use_ms: + if '.' 
in value: + # specification infelicity: would be nice to have 'advice to implementers' here + commentary.error('WARC versions <= 1.0 may not have timestamps with fractional seconds:', field, value) def validate_content_length(field, value, record, version, commentary, pending): From 8eb87e845f0b0a76cb0f3dd035bbdb52c19f6bc0 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Mon, 28 Jan 2019 16:46:59 -0800 Subject: [PATCH 56/68] buglet --- test/data/standard-torture-validate-field.warc | 1 + warcio/tester.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/test/data/standard-torture-validate-field.warc b/test/data/standard-torture-validate-field.warc index 816413be..126ba964 100644 --- a/test/data/standard-torture-validate-field.warc +++ b/test/data/standard-torture-validate-field.warc @@ -6,6 +6,7 @@ WARC-Target-URI: h<>ttp://example.com/ WARC-Type: does-not-exist WARC-Type: CAPITALIZED WARC-Concurrent-To: http://example.com/ +WARC-Concurrent-To: WARC-Record-ID: WARC-Date: 2017-03-06T04:03:53Z WARC-Date: 2017-03-06T04:03:53.Z diff --git a/warcio/tester.py b/warcio/tester.py index 5396ff3b..8e9d8da3 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -608,7 +608,7 @@ def validate_record(record): seen_fields = set() for field, value in record.rec_headers.headers: field_l = field.lower() - if field != 'warc-concurrent-to' and field_l in seen_fields: + if field_l != 'warc-concurrent-to' and field_l in seen_fields: commentary.error('duplicate field seen:', field, value) seen_fields.add(field_l) if field_l not in warc_fields: From 3a8747e04641e4e4030981dc36a7898b686060f9 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Tue, 29 Jan 2019 17:52:05 -0800 Subject: [PATCH 57/68] global checks --- .../data/standard-torture-validate-field.warc | 2 + test/test_tests.py | 51 +++- warcio/tester.py | 278 +++++++++++++++--- 3 files changed, 276 insertions(+), 55 deletions(-) diff --git a/test/data/standard-torture-validate-field.warc 
b/test/data/standard-torture-validate-field.warc index 126ba964..a928a4c4 100644 --- a/test/data/standard-torture-validate-field.warc +++ b/test/data/standard-torture-validate-field.warc @@ -33,6 +33,8 @@ WARC-Segment-Total-Length: 0 WARC-Segment-Total-Length: not-an-integer WARC-Refers-To-Target-URI: http://example.com WARC-Refers-To-Date: not-a-date +WARC-Refers-To-Filename: asdf +WARC-Refers-To-File-Offset: 1234 WARC-Unknown-Field: asdf Content-Length: 0 diff --git a/test/test_tests.py b/test/test_tests.py index 89851eca..ebbdb509 100644 --- a/test/test_tests.py +++ b/test/test_tests.py @@ -6,6 +6,14 @@ from .test_cli import patch_stdout +file_map = {} + + +def map_test_file(filename): + file_map[filename] = get_test_file(filename) + return file_map[filename] + + def helper(args, expected_exit_value): with patch_stdout() as buff: exit_value = None @@ -22,17 +30,16 @@ def helper(args, expected_exit_value): def remove_before_test_data(s): ret = '' for line in s.splitlines(True): - if '/test/data/' in line: - line = 'test/data/' + line.split('/test/data/', 1)[1] - if '\\test\\data\\' in line: - line = 'test/data/' + line.split('\\test\\data\\', 1)[1] + for filename, value in file_map.items(): + if value in line: + line = line.replace(value, 'test/data/' + filename) ret += line return ret def test_torture_validate_record(): files = ['standard-torture-validate-record.warc'] - files = [get_test_file(filename) for filename in files] + files = [map_test_file(filename) for filename in files] args = ['test'] args.extend(files) @@ -55,7 +62,7 @@ def test_torture_validate_record(): WARC-Type warcinfo digest not present error: missing required header: WARC-Date - comment: warc-fields body present but empty + comment: warc-fields block present but empty WARC-Record-ID WARC-Type warcinfo digest not present @@ -67,6 +74,7 @@ def test_torture_validate_record(): error: missing required header: WARC-Date error: responses for http/https should have Content-Type of application/http; 
msgtype=response or application/http: text/plain error: WARC-IP-Address should be used for http and https responses + error: http/https responses should have http headers WARC-Record-ID WARC-Type resource digest not present @@ -97,7 +105,7 @@ def test_torture_validate_record(): WARC-Type metadata digest not present error: missing required header: WARC-Date - comment: warc-fields body present but empty + comment: warc-fields block present but empty WARC-Record-ID WARC-Type metadata digest not present @@ -108,7 +116,7 @@ def test_torture_validate_record(): error: missing required header: Content-Type error: missing required header: WARC-Date error: missing required header: WARC-Target-URI - comment: extension seen: WARC-Profile none + comment: unknown value, perhaps an extension: WARC-Profile none comment: no revisit details validation done due to unknown profile: none WARC-Record-ID WARC-Type revisit @@ -120,7 +128,7 @@ def test_torture_validate_record(): recommendation: missing recommended header: WARC-Refers-To recommendation: missing recommended header: WARC-Refers-To-Date recommendation: missing recommended header: WARC-Refers-To-Target-URI - comment: extension seen: WARC-Profile http://netpreserve.org/warc/1.1/revisit/identical-payload-digest + comment: WARC-Profile value is for a different version: 1.0 http://netpreserve.org/warc/1.1/revisit/identical-payload-digest WARC-Record-ID WARC-Type revisit digest not present @@ -161,7 +169,7 @@ def test_torture_validate_record(): def test_torture_validate_field(): files = ['standard-torture-validate-field.warc'] - files = [get_test_file(filename) for filename in files] + files = [map_test_file(filename) for filename in files] args = ['test'] args.extend(files) @@ -219,10 +227,12 @@ def test_torture_validate_field(): comment: unknown WARC-Type: WARC-Type CAPITALIZED comment: unknown digest algorithm: WARC-Block-Digest asdf comment: Invalid-looking digest value: WARC-Block-Digest sha1:&$*^&*^#*&^ - comment: extension 
seen: WARC-Truncated invalid - comment: extension seen: WARC-Profile asdf + comment: unknown value, perhaps an extension: WARC-Truncated invalid + comment: unknown value, perhaps an extension: WARC-Profile asdf comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Date not-a-date + comment: This Heretrix extension never made it into the standard: WARC-Refers-To-Filename asdf + comment: This Heretrix extension never made it into the standard: WARC-Refers-To-File-Offset 1234 comment: unknown field, no validation performed: WARC-Unknown-Field asdf WARC-Record-ID None WARC-Type invalid @@ -240,6 +250,11 @@ def test_torture_validate_field(): error: missing required header: WARC-Record-ID error: missing required header: WARC-Target-URI recommendation: do not segment WARC-Type request +global warcinfo checks + comment: WARC-Warcinfo-ID not found: WARC-Warcinfo-ID asdf:asdf +global Concurrent-To checks + comment: WARC-Concurrent-To not found: WARC-Concurrent-To + comment: WARC-Concurrent-To not found: WARC-Concurrent-To http://example.com/ """ value = helper(args, 0) @@ -251,7 +266,7 @@ def test_torture_validate_field(): def test_arc(): files = ['does-not-exist.arc'] - files = [get_test_file(filename) for filename in files] + files = [map_test_file(filename) for filename in files] args = ['test'] args.extend(files) @@ -267,7 +282,7 @@ def test_arc(): def test_digests(): # needed for test coverage files = ['example-digest-bad.warc', 'example.warc'] - files = [get_test_file(filename) for filename in files] + files = [map_test_file(filename) for filename in files] args = ['test'] args.extend(files) @@ -282,23 +297,28 @@ def test_digests(): WARC-Type request digest pass error: WARC-IP-Address should be used for http and https requests + error: Duplicate WARC-Record-ID: found in files test/data/example-digest-bad.warc test/data/example-digest-bad.warc 
WARC-Record-ID WARC-Type request digest pass error: WARC-IP-Address should be used for http and https requests + error: Duplicate WARC-Record-ID: found in files test/data/example-digest-bad.warc test/data/example-digest-bad.warc WARC-Record-ID WARC-Type request digest pass error: WARC-IP-Address should be used for http and https requests + error: Duplicate WARC-Record-ID: found in files test/data/example-digest-bad.warc test/data/example-digest-bad.warc test/data/example.warc WARC-Record-ID WARC-Type request digest not present error: WARC-IP-Address should be used for http and https requests + error: Duplicate WARC-Record-ID: found in files test/data/example.warc test/data/example-digest-bad.warc WARC-Record-ID WARC-Type revisit digest present but not checked recommendation: missing recommended header: WARC-Refers-To + comment: This Heretrix extension never made it into the standard: WARC-Profile http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com/ comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Date 2017-03-06T04:02:06Z WARC-Record-ID @@ -318,12 +338,11 @@ def test_leftovers(): # hard to test because invalid WARC Content-Length raises in archiveiterator warcio.tester.validate_content_length('Content-Length', 'not-an-integer', None, '1.0', commentary, None) - # hard to test because warcio checks the WARC version + # hard to test because warcio raises for unknown WARC version warcio.tester.validate_profile('blah', 'blah', None, '999', commentary, None) expected = '''\ error: must be an integer: Content-Length not-an-integer -comment: no profile check because unknown warc version: blah blah ''' assert '\n'.join(commentary.comments())+'\n' == expected diff --git a/warcio/tester.py b/warcio/tester.py index 8e9d8da3..870c7d6e 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -3,14 +3,15 @@ import re import sys 
import six +from collections import defaultdict from warcio.archiveiterator import WARCIterator from warcio.utils import to_native_str, Digester from warcio.exceptions import ArchiveLoadFailed -class Commentary: - def __init__(self, record_id, rec_type): +class Commentary(object): + def __init__(self, record_id=None, rec_type=None): self._record_id = record_id self._rec_type = rec_type self.errors = [] @@ -37,6 +38,7 @@ def has_comments(self): return True def comments(self): + # XXX str() all of these, in case an int or other thing slips in? for e in self.errors: yield 'error: ' + ' '.join(e) for r in self.recommendations: @@ -55,6 +57,13 @@ def __getattr__(self, name): if self._content is None: self._content = self.obj.content_stream().read() return self._content + if name == 'stream_for_digest_check': + def _doit(): + while True: + piece = self.obj.content_stream().read(1024*1024) + if len(piece) == 0: + break + return _doit return getattr(self.__dict__['obj'], name) @@ -117,7 +126,7 @@ def validate_warc_fields(record, commentary): first_line = False if not lines: - commentary.comment('warc-fields body present but empty') + commentary.comment('warc-fields block present but empty') return # check known fields @@ -126,6 +135,7 @@ def validate_warc_fields(record, commentary): def validate_warcinfo(record, commentary, pending): content_type = record.rec_headers.get_header('Content-Type', 'none') if content_type.lower() != 'application/warc-fields': + # https://github.com/iipc/warc-specifications/issues/33 -- SHALL BE or recommended? 
commentary.recommendation('warcinfo Content-Type recommended to be application/warc-fields:', content_type) else: # format: warc-fields @@ -137,8 +147,8 @@ def validate_warcinfo(record, commentary, pending): validate_warc_fields(record, commentary) # whole-file tests: - # optional that warcinfo be first in file, still deserves a comment - # allowable for warcinfo to appear anywhere + # recommended that all files start with warcinfo + # elsewise allowable for warcinfo to appear anywhere def validate_response(record, commentary, pending): @@ -152,10 +162,32 @@ def validate_response(record, commentary, pending): if record.rec_headers.get_header('WARC-IP-Address') is None: commentary.error('WARC-IP-Address should be used for http and https responses') - # error: http and https schemes should have http response headers - # test by attempting to parse them? + if not record.http_headers: + commentary.error('http/https responses should have http headers') + return - # comment: verify http content-length, if present -- commoncrawl nutch bug + http_content_length = record.http_headers.get_header('Content-Length') + if http_content_length is None: + return + + if not http_content_length.isdigit(): + commentary.comment('http content length header is not an integer', str(http_content_length)) + return + + # We want to verify http_content_length, which is the size of the compressed payload + # Trying to catch that commoncrawl nutch bug that prefixed /r/n to the payload without changing http content-length + + # this blecherous hack is because we need the length of the (possibly compressed) raw stream + # without reading any of it (so that it can be read elsewhere to check the payload digest) + + # XXX fix me before shipping :-D + + if hasattr(record, 'raw_stream'): + if hasattr(record.raw_stream, 'stream'): + if hasattr(record.raw_stream.stream, 'limit'): + if int(http_content_length) != record.raw_stream.stream.limit: + commentary.comment('Actual http payload length is 
different from http header Content-Length:', + str(record.raw_stream.stream.limit), http_content_length) def validate_resource(record, commentary, pending): @@ -171,6 +203,7 @@ def validate_resource(record, commentary, pending): pass # should never have http headers + # heuristic of looking for an http status line? and then a blank line?! def validate_request(record, commentary, pending): @@ -193,6 +226,8 @@ def validate_request(record, commentary, pending): def validate_metadata(record, commentary, pending): content_type = record.rec_headers.get_header('Content-Type', 'none') if content_type.lower() == 'application/warc-fields': + # https://github.com/iipc/warc-specifications/issues/33 SHALL be or not? + # # dublin core plus via, hopsFromSeed, fetchTimeMs -- w1.1 section 6 # via: uri -- example in Warc 1.1 section 10.5 does not have <> around it # hopsFromSeed: string @@ -206,8 +241,11 @@ def validate_revisit(record, commentary, pending): if warc_profile.endswith('/revisit/identical-payload-digest') or warc_profile.endswith('/revisit/uri-agnostic-identical-payload-digest'): config = { 'required': ['WARC-Payload-Digest'], - 'recommended': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date'], + 'recommended': ['WARC-Refers-To'], } + if '/1.1/' in warc_profile: + config['recommended'].extend(('WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date')) + validate_fields_against_rec_type('revisit', config, record.rec_headers, commentary, allow_all=True) # may have record block; # if not, shall have Content-Length: 0, @@ -282,7 +320,6 @@ def validate_bracketed_uri(field, value, record, version, commentary, pending): def validate_record_id(field, value, record, version, commentary, pending): validate_bracketed_uri(field, value, record, version, commentary, pending) - # TODO: should be "globally unique for its period of intended use" def validate_timestamp(field, value, record, version, commentary, pending): @@ -328,8 +365,6 @@ def validate_content_type(field, 
value, record, version, commentary, pending): # at this point there can be multiple parameters, # some of which could have quoted string values with ; in them - # TODO: more checking - def validate_digest(field, value, record, version, commentary, pending): if ':' not in value: @@ -370,37 +405,45 @@ def validate_ip(field, value, record, version, commentary, pending): def validate_truncated(field, value, record, version, commentary, pending): if value.lower() not in {'length', 'time', 'disconnect', 'unspecified'}: - commentary.comment('extension seen:', field, value) + commentary.comment('unknown value, perhaps an extension:', field, value) def validate_warcinfo_id(field, value, record, version, commentary, pending): validate_bracketed_uri(field, value, record, version, commentary, pending) - # TODO: should point at a warcinfo record def validate_filename(field, value, record, version, commentary, pending): - # TODO: text or quoted-string + # text or quoted-string + # comment for dangerous utf-8 in filename? pass profiles = { - # XXX WARC/0.17 and WARC/0.18 + '0.17': ['http://netpreserve.org/warc/0.17/revisit/identical-payload-digest', + 'http://netpreserve.org/warc/0.17/revisit/server-not-modified'], + '0.18': ['http://netpreserve.org/warc/0.18/revisit/identical-payload-digest', + 'http://netpreserve.org/warc/0.18/revisit/server-not-modified'], '1.0': ['http://netpreserve.org/warc/1.0/revisit/identical-payload-digest', 'http://netpreserve.org/warc/1.0/revisit/server-not-modified', - # the following removed from iipc/webarchive-commons in may 2017; common in the wild TODO comment or not? 
- # https://github.com/iipc/webarchive-commons/commits/988bec707c27a01333becfc3bd502af4441ea1e1/src/main/java/org/archive/format/warc/WARCConstants.java 'http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest'], '1.1': ['http://netpreserve.org/warc/1.1/revisit/identical-payload-digest', 'http://netpreserve.org/warc/1.1/revisit/server-not-modified'], } +profiles_rev = dict([(filename, version) for version, filenames in profiles.items() for filename in filenames]) def validate_profile(field, value, record, version, commentary, pending): if version not in profiles: - commentary.comment('no profile check because unknown warc version:', field, value) return - if value not in profiles[version]: - commentary.comment('extension seen:', field, value) + + if value in profiles_rev: + if profiles_rev[value] != version: + commentary.comment('WARC-Profile value is for a different version:', version, value) + else: + commentary.comment('unknown value, perhaps an extension:', field, value) + + if '/revisit/uri-agnostic-identical-payload-digest' in value: + commentary.comment('This Heretrix extension never made it into the standard:', field, value) def validate_segment_number(field, value, record, version, commentary, pending): @@ -427,6 +470,14 @@ def validate_segment_total_length(field, value, record, version, commentary, pen commentary.error('must be an integer:', field, value) +def validate_refers_to_filename(field, value, record, version, commentary, pending): + commentary.comment('This Heretrix extension never made it into the standard:', field, value) + + +def validate_refers_to_file_offset(field, value, record, version, commentary, pending): + commentary.comment('This Heretrix extension never made it into the standard:', field, value) + + warc_fields = { 'WARC-Type': { 'validate': validate_warc_type, @@ -493,6 +544,12 @@ def validate_segment_total_length(field, value, record, version, commentary, pen 'validate': validate_timestamp, 'minver': '1.1', }, 
+ 'WARC-Refers-To-Filename': { + 'validate': validate_refers_to_filename, + }, + 'WARC-Refers-To-File-Offset': { + 'validate': validate_refers_to_file_offset, + }, } warc_fields = dict([(k.lower(), v) for k, v in warc_fields.items()]) @@ -579,13 +636,13 @@ def validate_fields_against_rec_type(rec_type, config, rec_headers, commentary, allowed = make_header_set(config, ('required', 'optional', 'recommended', 'ignored')) prohibited = make_header_set(config, ('prohibited',)) - for field, value in rec_headers.headers: + for field, value in rec_headers.headers: # XXX not exported fl = field.lower() if fl in prohibited: commentary.error('field not allowed in record type:', rec_type, field) elif allow_all or fl in allowed: pass - elif fl in warc_fields: # pragma: no cover (this is a configuration error, if it happens) + elif fl in warc_fields: # pragma: no cover (this is a tester.py configuration omission) commentary.comment('Known field, but not expected for this record type:', rec_type, field) else: # an 'unknown field' comment has already been issued in validate_record @@ -598,15 +655,15 @@ def validate_record_against_rec_type(config, record, commentary, pending): def validate_record(record): - version = record.rec_headers.protocol.split('/', 1)[1] # XXX not exported? 
+ version = record.rec_headers.protocol.split('/', 1)[1] # XXX not exported record_id = record.rec_headers.get_header('WARC-Record-ID') rec_type = record.rec_headers.get_header('WARC-Type') - commentary = Commentary(record_id, rec_type) + commentary = Commentary(record_id=record_id, rec_type=rec_type) pending = None seen_fields = set() - for field, value in record.rec_headers.headers: + for field, value in record.rec_headers.headers: # XXX not exported field_l = field.lower() if field_l != 'warc-concurrent-to' and field_l in seen_fields: commentary.error('duplicate field seen:', field, value) @@ -617,13 +674,13 @@ def validate_record(record): config = warc_fields[field_l] if 'minver' in config: if version < config['minver']: - # unknown fields are extensions, so this is a comment and not an error commentary.comment('field was introduced after this warc version:', version, field, value) if 'validate' in config: config['validate'](field, value, record, version, commentary, pending) if rec_type not in record_types: - pass # we print a comment for this elsewhere + # we print a comment for this elsewhere + pass else: validate_fields_against_rec_type(rec_type, record_types[rec_type], record.rec_headers, commentary) validate_record_against_rec_type(record_types[rec_type], record, commentary, pending) @@ -631,10 +688,149 @@ def validate_record(record): return commentary -def _process_one(warc): - if warc.endswith('.arc') or warc.endswith('.arc.gz'): +def save_global_info(record, warcfile, commentary, all_records, concurrent_to): + record_id = record.rec_headers.get_header('WARC-Record-ID') + if record_id is None: return - with open(warc, 'rb') as stream: + + for field, value in record.rec_headers.headers: # XXX not exported + if field.lower() == 'warc-concurrent-to': + if record_id is not None and value is not None: + concurrent_to[record_id].append(value) + concurrent_to[value].append(record_id) + + save = {'warcfile': warcfile} + + saved_fields = ( + 'WARC-Type', 
'WARC-Warcinfo-ID', 'WARC-Date' + 'WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Payload-Digest', 'WARC-Target-URI', + 'WARC-Segment-Number', 'WARC-Segment-Origin-ID', 'WARC-Segment-Total-Length', 'WARC-Truncated' + ) + saved_fields = set([x.lower() for x in saved_fields]) + + for field, value in record.rec_headers.headers: # XXX not exported + field_l = field.lower() + if field_l in saved_fields and value is not None: + save[field_l] = value + if field_l == 'warc-concurrent-to': + if 'warc-concurrent-to' not in save: + save['warc-concurrent-to'] = [] + save['warc-concurrent-to'].append(value) + + if record_id in all_records: + commentary.error('Duplicate WARC-Record-ID:', record_id, 'found in files', warcfile, all_records[record_id]['warcfile']) + else: + all_records[record_id] = save + + +def check_global(all_records, concurrent_to): + check_global_warcinfo(all_records) + check_global_concurrent_to(all_records, concurrent_to) + check_global_refers_to(all_records) + check_global_segment(all_records) + + +def _print_global(header, commentary): + if commentary.has_comments(): + print(header) + for c in commentary.comments(): + print(' ', c) + + +def check_global_warcinfo(all_records): + commentary = Commentary() + for record_id, fields in all_records.items(): + if 'warc-warcinfo-id' in fields: + wanted_id = fields['warc-warcinfo-id'] + if wanted_id not in all_records or all_records[wanted_id]['warc-type'] != 'warcinfo': + commentary.comment('WARC-Warcinfo-ID not found:', record_id, 'WARC-Warcinfo-ID', wanted_id) + + _print_global('global warcinfo checks', commentary) + + +def check_global_concurrent_to(all_records, concurrent_to): + commentary = Commentary() + for record_id, fields in all_records.items(): + if 'warc-concurrent-to' in fields: + whole_set = set(fields['warc-concurrent-to']) + del fields['warc-concurrent-to'] + while True: + current_set = list(whole_set) + for c in current_set: + if c in all_records and 'warc-concurrent-to' 
in all_records[c]: + whole_set.update(set(all_records[c]['warc-concurrent-to'])) + del all_records[c]['warc-concurrent-to'] + if len(whole_set) == len(current_set): + break + warc_date = fields.get('warc-date') + for wanted_id in sorted(whole_set): + if wanted_id not in all_records: + commentary.comment('WARC-Concurrent-To not found:', record_id, 'WARC-Concurrent-To', wanted_id) + else: + new_date = all_records[wanted_id].get('warc-date') + if warc_date != new_date: + commentary.comment('WARC-Concurrent-To set has conflicting dates:', + record_id, warc_date, wanted_id, new_date) + + _print_global('global Concurrent-To checks', commentary) + + +def _revisit_compare(record_id, fields, source_field, wanted_id, all_records, target_field, commentary): + if source_field.lower() not in fields: + return + + if target_field.lower() not in all_records[wanted_id]: + commentary.comment('revisit target lacks field:', wanted_id, target_field) + return + + source_value = fields[source_field.lower()] + target_value = all_records[wanted_id][target_field.lower()] + if source_value != target_value: + commentary.comment('revisit and revisit target disagree:', + record_id, source_field, source_value, + wanted_id, target_field, target_value) + + +def check_global_refers_to(all_records): + commentary = Commentary() + for record_id, fields in all_records.items(): + if 'warc-refers-to' not in fields: + continue + + wanted_id = fields['warc-refers-to'] + if wanted_id not in all_records: + commentary.comment('WARC-Refers-To target not found:', record_id, 'Warc-Refers-To', wanted_id) + continue + + rec_type = fields.get('warc-type') + if rec_type != 'revisit': + continue + + _revisit_compare(record_id, fields, 'WARC-Refers-To-Target-URI', + wanted_id, all_records, 'WARC-Target-URI', commentary) + _revisit_compare(record_id, fields, 'WARC-Refers-To-Date', + wanted_id, all_records, 'WARC-Date', commentary) + _revisit_compare(record_id, fields, 'WARC-Payload-Digest', + wanted_id, all_records, 
'WARC-Payload-Digest', commentary) + + _print_global('global Refers-To checks', commentary) + + +def check_global_segment(all_records): + # warc-segment-origin-id :: exists, is warc-segment-number 1 + # all segments exist, and the last one has WARC-Segment-Total-Length + # and only the last one has WARC-Truncated, if any + + # Segmentation shall not be used if a record can be stored in an existing warc file + # The origin segment shall be placed in a new warc file preceded only by a warcinfo record (if any) + + pass + + +def _process_one(warcfile, all_records, concurrent_to): + if warcfile.endswith('.arc') or warcfile.endswith('.arc.gz'): + return + with open(warcfile, 'rb') as stream: for record in WARCIterator(stream, check_digests=True, fixup_bugs=False): record = WrapRecord(record) @@ -642,10 +838,9 @@ def _process_one(warc): record.rec_headers.get_header('WARC-Block-Digest')) commentary = validate_record(record) + save_global_info(record, warcfile, commentary, all_records, concurrent_to) - record.content # make sure digests are checked - # XXX might need to read and digest the raw stream to check digests for chunked encoding? 
- # XXX chunked lacks Content-Length and presumably the digest needs to be computed on the non-chunked bytes + record.stream_for_digest_check() if commentary.has_comments() or record.digest_checker.passed is False: print(' ', 'WARC-Record-ID', commentary.record_id()) @@ -671,16 +866,21 @@ class Tester(object): def __init__(self, cmd): self.inputs = cmd.inputs self.exit_value = 0 + self.all_records = defaultdict(dict) + self.concurrent_to = defaultdict(list) def process_all(self): - for warc in self.inputs: - print(warc) + for warcfile in self.inputs: + print(warcfile) try: - self.process_one(warc) + self.process_one(warcfile) except ArchiveLoadFailed as e: print(' saw exception ArchiveLoadFailed: '+str(e).rstrip(), file=sys.stderr) print(' skipping rest of file', file=sys.stderr) + + check_global(self.all_records, self.concurrent_to) + return self.exit_value - def process_one(self, filename): - _process_one(filename) + def process_one(self, warcfile): + _process_one(warcfile, self.all_records, self.concurrent_to) From f7cd1dbb28cb4033bd70878729f86479a0643971 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Thu, 31 Jan 2019 12:03:44 -0800 Subject: [PATCH 58/68] check -v; capitalize most commentary --- warcio/cli.py | 1 + warcio/tester.py | 89 +++++++++++++++++++++++++----------------------- 2 files changed, 48 insertions(+), 42 deletions(-) diff --git a/warcio/cli.py b/warcio/cli.py index ada44f12..bbe51a93 100644 --- a/warcio/cli.py +++ b/warcio/cli.py @@ -55,6 +55,7 @@ def main(args=None): test = subparsers.add_parser('test', help='WARC standards tester') test.add_argument('inputs', nargs='+') + test.add_argument('-v', '--verbose', action='store_true') test.set_defaults(func=tester) cmd = parser.parse_args(args=args) diff --git a/warcio/tester.py b/warcio/tester.py index 870c7d6e..9605ea7b 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -157,7 +157,7 @@ def validate_response(record, commentary, pending): if target_uri.startswith('http:') or 
target_uri.startswith('https:'): content_type = record.rec_headers.get_header('Content-Type', 'none') if canon_content_type(content_type) not in {'application/http; msgtype=response', 'application/http'}: - commentary.error('responses for http/https should have Content-Type of application/http; msgtype=response or application/http:', content_type) + commentary.error('Responses for http/https should have Content-Type of application/http; msgtype=response or application/http:', content_type) if record.rec_headers.get_header('WARC-IP-Address') is None: commentary.error('WARC-IP-Address should be used for http and https responses') @@ -264,7 +264,7 @@ def validate_revisit(record, commentary, pending): # if yes, should be like a response record, truncated if desired # WARC-Refers-To-Date should be the same as WARC-Date in the original record if present else: - commentary.comment('no revisit details validation done due to unknown profile:', warc_profile) + commentary.comment('No revisit details validation done due to unknown profile:', warc_profile) def validate_conversion(record, commentary, pending): @@ -291,14 +291,17 @@ def validate_unbracketed_uri(field, value, record, version, commentary, pending) if value.startswith('<') or value.endswith('>'): # wget 1.19 bug caused by WARC 1.0 spec error commentary.error('uri must not be within <>:', field, value) + value = value[1:-1] + + scheme = value.split(':', 1)[0] if ':' not in value: - commentary.error('invalid uri, no scheme:', field, value) + commentary.error('Invalid uri, no scheme:', field, value) + elif not re.search(r'\A[A-Za-z][A-Za-z0-9+\-\.]*\Z', scheme): + commentary.error('Invalid uri scheme, bad character:', field, value) + # use https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml ?? 
+ if re.search(r'\s', value): - commentary.error('invalid uri, contains whitespace:', field, value) - scheme = value.split(':', 1)[0] - if not re.search(r'\A[A-Za-z][A-Za-z0-9+\-\.]*\Z', scheme): - commentary.error('invalid uri scheme, bad character:', field, value) - # https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml + commentary.error('Invalid uri, contains whitespace:', field, value) def validate_warc_type(field, value, record, version, commentary, pending): @@ -307,7 +310,7 @@ def validate_warc_type(field, value, record, version, commentary, pending): commentary.comment('WARC-Type is not lower-case:', field, value) if value.lower() not in record_types: # standard says readers should ignore unknown warc-types - commentary.comment('unknown WARC-Type:', field, value) + commentary.comment('Unknown WARC-Type:', field, value) def validate_bracketed_uri(field, value, record, version, commentary, pending): @@ -337,7 +340,7 @@ def validate_timestamp(field, value, record, version, commentary, pending): def validate_content_length(field, value, record, version, commentary, pending): if not value.isdigit(): - commentary.error('must be an integer:', field, value) + commentary.error('Must be an integer:', field, value) token_re = r'\A[!"#$%&\'()*+\-\.0-9A-Z\^_`a-z|~]+\Z' @@ -346,7 +349,7 @@ def validate_content_length(field, value, record, version, commentary, pending): def validate_content_type(field, value, record, version, commentary, pending): if '/' not in value: - commentary.error('must contain a /:', field, value) + commentary.error('Must contain a /:', field, value) splits = value.split('/', 1) ctype = splits[0] if len(splits) > 1: @@ -354,13 +357,13 @@ def validate_content_type(field, value, record, version, commentary, pending): else: rest = '' if not re.search(token_re, ctype): - commentary.error('invalid type:', field, value) + commentary.error('Invalid type:', field, value) if ';' in rest: subtype, rest = rest.split(';', 1) else: subtype = rest if 
not re.search(token_re, subtype): - commentary.error('invalid subtype:', field, value) + commentary.error('Invalid subtype:', field, value) # at this point there can be multiple parameters, # some of which could have quoted string values with ; in them @@ -368,7 +371,7 @@ def validate_content_type(field, value, record, version, commentary, pending): def validate_digest(field, value, record, version, commentary, pending): if ':' not in value: - commentary.error('missing algorithm:', field, value) + commentary.error('Missing algorithm:', field, value) splits = value.split(':', 1) algorithm = splits[0] if len(splits) > 1: @@ -376,12 +379,12 @@ def validate_digest(field, value, record, version, commentary, pending): else: digest = 'none' if not re.search(token_re, algorithm): - commentary.error('invalid algorithm:', field, value) + commentary.error('Invalid algorithm:', field, value) else: try: Digester(algorithm) except ValueError: - commentary.comment('unknown digest algorithm:', field, value) + commentary.comment('Unknown digest algorithm:', field, value) if not re.search(token_re, digest): # https://github.com/iipc/warc-specifications/issues/48 # commentary.comment('spec incorrectly says this is an invalid digest', field, value) @@ -398,14 +401,14 @@ def validate_ip(field, value, record, version, commentary, pending): value = unicode(value) ipaddress.ip_address(value) except ValueError: - commentary.error('invalid ip:', field, value) + commentary.error('Invalid ip:', field, value) except (ImportError, NameError): # pragma: no cover - commentary.comment('did not check ip address format, install ipaddress module from pypi if you care') + commentary.comment('Did not check ip address format, install ipaddress module from pypi if you care') def validate_truncated(field, value, record, version, commentary, pending): if value.lower() not in {'length', 'time', 'disconnect', 'unspecified'}: - commentary.comment('unknown value, perhaps an extension:', field, value) + 
commentary.comment('Unknown value, perhaps an extension:', field, value) def validate_warcinfo_id(field, value, record, version, commentary, pending): @@ -440,7 +443,7 @@ def validate_profile(field, value, record, version, commentary, pending): if profiles_rev[value] != version: commentary.comment('WARC-Profile value is for a different version:', version, value) else: - commentary.comment('unknown value, perhaps an extension:', field, value) + commentary.comment('Unknown value, perhaps an extension:', field, value) if '/revisit/uri-agnostic-identical-payload-digest' in value: commentary.comment('This Heretrix extension never made it into the standard:', field, value) @@ -448,26 +451,26 @@ def validate_profile(field, value, record, version, commentary, pending): def validate_segment_number(field, value, record, version, commentary, pending): if not value.isdigit(): - commentary.error('must be an integer:', field, value) + commentary.error('Must be an integer:', field, value) return iv = int(value) if iv == 0: - commentary.error('must be 1 or greater:', field, value) + commentary.error('Must be 1 or greater:', field, value) rec_type = record.rec_headers.get_header('WARC-Type', 'none') if rec_type != 'continuation': if iv != 1: - commentary.error('non-continuation records must always have WARC-Segment-Number: 1:', field, value) + commentary.error('Non-continuation records must always have WARC-Segment-Number: 1:', field, value) origin_id = record.rec_headers.get_header('WARC-Segment-Origin-ID') if origin_id is None: - commentary.error('segmented records must have both WARC-Segment-Number and WARC-Segment-Origin-ID') + commentary.error('Segmented records must have both WARC-Segment-Number and WARC-Segment-Origin-ID') if rec_type in {'warcinfo', 'request', 'metadata', 'revisit'}: - commentary.recommendation('do not segment WARC-Type', rec_type) + commentary.recommendation('Do not segment WARC-Type', rec_type) def validate_segment_total_length(field, value, record, 
version, commentary, pending): if not value.isdigit(): - commentary.error('must be an integer:', field, value) + commentary.error('Must be an integer:', field, value) def validate_refers_to_filename(field, value, record, version, commentary, pending): @@ -525,6 +528,7 @@ def validate_refers_to_file_offset(field, value, record, version, commentary, pe 'validate': validate_profile, }, 'WARC-Identified-Payload-Type': { + # see also https://github.com/iipc/warc-specifications/issues/49 -- odd that it's allowed for request, revisit, continuation 'validate': validate_content_type, }, 'WARC-Segment-Origin-ID': { @@ -565,7 +569,7 @@ def validate_refers_to_file_offset(field, value, record, version, commentary, pe 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'Content-Type', 'WARC-Target-URI'], 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated', - 'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Segment-Number', 'WARC-Segment-Origin-ID'], + 'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Identified-Payload-Type', 'WARC-Segment-Number', 'WARC-Segment-Origin-ID'], 'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], 'validate': validate_response, }, @@ -605,7 +609,7 @@ def validate_refers_to_file_offset(field, value, record, version, commentary, pe }, 'conversion': { 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'WARC-Target-URI'], - 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Truncated', 'WARC-Refers-To', 'WARC-Warcinfo-ID', 'WARC-Segment-Number', 'WARC-Segment-Origin-ID'], + 'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Truncated', 'WARC-Refers-To', 'WARC-Warcinfo-ID', 'WARC-Identified-Payload-Type', 'WARC-Segment-Number', 'WARC-Segment-Origin-ID'], 'prohibited': ['WARC-Concurrent-To', 'WARC-IP-Address', 'WARC-Refers-To-Target-URI', 
'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], 'validate': validate_conversion, }, @@ -613,7 +617,7 @@ def validate_refers_to_file_offset(field, value, record, version, commentary, pe 'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'WARC-Segment-Number', 'WARC-Segment-Origin-ID', 'WARC-Target-URI'], 'optional': ['WARC-Segment-Total-Length', 'WARC-Truncated'], - 'prohibited': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'], + 'prohibited': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile', 'WARC-Identified-Payload-Type'], 'validate': validate_continuation, }, } @@ -629,17 +633,17 @@ def make_header_set(config, kinds): def validate_fields_against_rec_type(rec_type, config, rec_headers, commentary, allow_all=False): for req in sorted(config.get('required', [])): if not rec_headers.get_header(req): - commentary.error('missing required header:', req) + commentary.error('Missing required header:', req) for rec in sorted(config.get('recommended', [])): if not rec_headers.get_header(rec): - commentary.recommendation('missing recommended header:', rec) + commentary.recommendation('Missing recommended header:', rec) allowed = make_header_set(config, ('required', 'optional', 'recommended', 'ignored')) prohibited = make_header_set(config, ('prohibited',)) for field, value in rec_headers.headers: # XXX not exported fl = field.lower() if fl in prohibited: - commentary.error('field not allowed in record type:', rec_type, field) + commentary.error('Field not allowed in record type:', rec_type, field) elif allow_all or fl in allowed: pass elif fl in warc_fields: # pragma: no cover (this is a tester.py configuration omission) @@ -666,15 +670,15 @@ def validate_record(record): 
for field, value in record.rec_headers.headers: # XXX not exported field_l = field.lower() if field_l != 'warc-concurrent-to' and field_l in seen_fields: - commentary.error('duplicate field seen:', field, value) + commentary.error('Duplicate field seen:', field, value) seen_fields.add(field_l) if field_l not in warc_fields: - commentary.comment('unknown field, no validation performed:', field, value) + commentary.comment('Unknown field, no validation performed:', field, value) continue config = warc_fields[field_l] if 'minver' in config: if version < config['minver']: - commentary.comment('field was introduced after this warc version:', version, field, value) + commentary.comment('Field was introduced after this warc version:', version, field, value) if 'validate' in config: config['validate'](field, value, record, version, commentary, pending) @@ -780,13 +784,13 @@ def _revisit_compare(record_id, fields, source_field, wanted_id, all_records, ta return if target_field.lower() not in all_records[wanted_id]: - commentary.comment('revisit target lacks field:', wanted_id, target_field) + commentary.comment('Revisit target lacks field:', wanted_id, target_field) return source_value = fields[source_field.lower()] target_value = all_records[wanted_id][target_field.lower()] if source_value != target_value: - commentary.comment('revisit and revisit target disagree:', + commentary.comment('Revisit and revisit target disagree:', record_id, source_field, source_value, wanted_id, target_field, target_value) @@ -827,7 +831,7 @@ def check_global_segment(all_records): pass -def _process_one(warcfile, all_records, concurrent_to): +def _process_one(warcfile, all_records, concurrent_to, verbose): if warcfile.endswith('.arc') or warcfile.endswith('.arc.gz'): return with open(warcfile, 'rb') as stream: @@ -842,7 +846,7 @@ def _process_one(warcfile, all_records, concurrent_to): record.stream_for_digest_check() - if commentary.has_comments() or record.digest_checker.passed is False: + if 
verbose or commentary.has_comments() or record.digest_checker.passed is False: print(' ', 'WARC-Record-ID', commentary.record_id()) print(' ', 'WARC-Type', commentary.rec_type()) @@ -865,6 +869,7 @@ def _process_one(warcfile, all_records, concurrent_to): class Tester(object): def __init__(self, cmd): self.inputs = cmd.inputs + self.verbose = cmd.verbose self.exit_value = 0 self.all_records = defaultdict(dict) self.concurrent_to = defaultdict(list) @@ -875,12 +880,12 @@ def process_all(self): try: self.process_one(warcfile) except ArchiveLoadFailed as e: - print(' saw exception ArchiveLoadFailed: '+str(e).rstrip(), file=sys.stderr) - print(' skipping rest of file', file=sys.stderr) + print(' saw exception ArchiveLoadFailed: '+str(e).rstrip()) + print(' skipping rest of file') check_global(self.all_records, self.concurrent_to) return self.exit_value def process_one(self, warcfile): - _process_one(warcfile, self.all_records, self.concurrent_to) + _process_one(warcfile, self.all_records, self.concurrent_to, self.verbose) From b570b6c09cd9e32110b5127721e4676ac7405d02 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Thu, 31 Jan 2019 21:49:41 -0800 Subject: [PATCH 59/68] ... 
--- test/test_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_tests.py b/test/test_tests.py index ebbdb509..9c3c9fec 100644 --- a/test/test_tests.py +++ b/test/test_tests.py @@ -342,7 +342,7 @@ def test_leftovers(): warcio.tester.validate_profile('blah', 'blah', None, '999', commentary, None) expected = '''\ -error: must be an integer: Content-Length not-an-integer +error: Must be an integer: Content-Length not-an-integer ''' assert '\n'.join(commentary.comments())+'\n' == expected From 921e7486d2ef84da7684833acce41ae3957710dc Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Fri, 1 Feb 2019 10:25:40 -0800 Subject: [PATCH 60/68] revisits and global detection with just one file --- warcio/tester.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/warcio/tester.py b/warcio/tester.py index 9605ea7b..68f108b2 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -722,7 +722,10 @@ def save_global_info(record, warcfile, commentary, all_records, concurrent_to): save['warc-concurrent-to'].append(value) if record_id in all_records: - commentary.error('Duplicate WARC-Record-ID:', record_id, 'found in files', warcfile, all_records[record_id]['warcfile']) + if warcfile != all_records[record_id]['warcfile']: + commentary.error('Duplicate WARC-Record-ID:', record_id, 'found in files', warcfile, all_records[record_id]['warcfile']) + else: + commentary.error('Duplicate WARC-Record-ID:', record_id) else: all_records[record_id] = save @@ -853,9 +856,12 @@ def _process_one(warcfile, all_records, concurrent_to, verbose): if record.digest_checker.passed is True: print(' digest pass') elif record.digest_checker.passed is None: - if digest_present: # pragma: no cover - # WARC record missing Content-Length: header, which is verboten - print(' digest present but not checked') + if digest_present: + if commentary.rec_type() == 'revisit': + print(' digest present but not checked (revisit)') + else: # pragma: no cover + # 
WARC record missing Content-Length: header, which is verboten + print(' digest present but not checked') else: print(' digest not present') for p in record.digest_checker.problems: From 4265b62e5450cdb0b9d694a0d9e76124b2c4ec8c Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Fri, 1 Feb 2019 15:47:01 -0800 Subject: [PATCH 61/68] show errors for decompression and unchunking failures --- test/test_tests.py | 14 +++++++------- warcio/archiveiterator.py | 5 +++-- warcio/bufferedreaders.py | 17 ++++++++++++++--- warcio/recordloader.py | 10 ++++++---- warcio/tester.py | 12 ++++++++++-- 5 files changed, 40 insertions(+), 18 deletions(-) diff --git a/test/test_tests.py b/test/test_tests.py index 9c3c9fec..200df8ae 100644 --- a/test/test_tests.py +++ b/test/test_tests.py @@ -297,17 +297,17 @@ def test_digests(): WARC-Type request digest pass error: WARC-IP-Address should be used for http and https requests - error: Duplicate WARC-Record-ID: found in files test/data/example-digest-bad.warc test/data/example-digest-bad.warc + error: Duplicate WARC-Record-ID: WARC-Record-ID WARC-Type request digest pass error: WARC-IP-Address should be used for http and https requests - error: Duplicate WARC-Record-ID: found in files test/data/example-digest-bad.warc test/data/example-digest-bad.warc + error: Duplicate WARC-Record-ID: WARC-Record-ID WARC-Type request digest pass error: WARC-IP-Address should be used for http and https requests - error: Duplicate WARC-Record-ID: found in files test/data/example-digest-bad.warc test/data/example-digest-bad.warc + error: Duplicate WARC-Record-ID: test/data/example.warc WARC-Record-ID WARC-Type request @@ -316,11 +316,11 @@ def test_digests(): error: Duplicate WARC-Record-ID: found in files test/data/example.warc test/data/example-digest-bad.warc WARC-Record-ID WARC-Type revisit - digest present but not checked - recommendation: missing recommended header: WARC-Refers-To + digest present but not checked (revisit) + recommendation: Missing 
recommended header: WARC-Refers-To comment: This Heretrix extension never made it into the standard: WARC-Profile http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest - comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com/ - comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Date 2017-03-06T04:02:06Z + comment: Field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com/ + comment: Field was introduced after this warc version: 1.0 WARC-Refers-To-Date 2017-03-06T04:02:06Z WARC-Record-ID WARC-Type request digest not present diff --git a/warcio/archiveiterator.py b/warcio/archiveiterator.py index 24094936..5e9c02ca 100644 --- a/warcio/archiveiterator.py +++ b/warcio/archiveiterator.py @@ -56,13 +56,14 @@ class ArchiveIterator(six.Iterator): def __init__(self, fileobj, no_record_parse=False, verify_http=False, arc2warc=False, ensure_http_headers=False, block_size=BUFF_SIZE, - check_digests=False, fixup_bugs=True): + check_digests=False, fixup_bugs=True, raise_exceptions=False): self.fh = fileobj self.loader = ArcWarcRecordLoader(verify_http=verify_http, arc2warc=arc2warc, - fixup_bugs=fixup_bugs) + fixup_bugs=fixup_bugs, + raise_exceptions=raise_exceptions) self.known_format = None self.mixed_arc_warc = arc2warc diff --git a/warcio/bufferedreaders.py b/warcio/bufferedreaders.py index 0b7f72f7..734ce23a 100644 --- a/warcio/bufferedreaders.py +++ b/warcio/bufferedreaders.py @@ -36,6 +36,13 @@ def brotli_decompressor(): pass +#================================================================= +class DecompressionException(Exception): + def __init__(self, msg, data=b''): + Exception.__init__(self, msg) + self.data = data + + #================================================================= class BufferedReader(object): """ @@ -64,7 +71,8 @@ class BufferedReader(object): def __init__(self, stream, block_size=BUFF_SIZE, decomp_type=None, 
starting_data=None, - read_all_members=False): + read_all_members=False, + raise_exceptions=False): self.stream = stream self.block_size = block_size @@ -77,6 +85,7 @@ def __init__(self, stream, block_size=BUFF_SIZE, self.buff_size = 0 self.read_all_members = read_all_members + self.raise_exceptions = raise_exceptions def set_decomp(self, decomp_type): self._init_decomp(decomp_type) @@ -142,6 +151,8 @@ def _decompress(self, data): self._init_decomp('deflate_alt') data = self._decompress(data) else: + if self.raise_exceptions: + raise DecompressionException(str(e)) self.decompressor = None # otherwise (partly decompressed), something is wrong else: @@ -280,13 +291,13 @@ class ChunkedDataReader(BufferedReader): If at any point the chunked header is not available, the stream is assumed to not be chunked and no more dechunking occurs. """ - def __init__(self, stream, raise_exceptions=False, **kwargs): + def __init__(self, stream, **kwargs): super(ChunkedDataReader, self).__init__(stream, **kwargs) self.all_chunks_read = False self.not_chunked = False # if False, we'll use best-guess fallback for parse errors - self.raise_chunked_data_exceptions = raise_exceptions + self.raise_chunked_data_exceptions = kwargs.get('raise_exceptions') def _fillbuff(self, block_size=None): if self.not_chunked: diff --git a/warcio/recordloader.py b/warcio/recordloader.py index 2f48233b..3903f4b1 100644 --- a/warcio/recordloader.py +++ b/warcio/recordloader.py @@ -23,6 +23,7 @@ def __init__(self, *args, **kwargs): self.http_headers, self.content_type, self.length) = args self.payload_length = -1 self.digest_checker = kwargs.get('digest_checker') + self.raise_exceptions = kwargs.get('raise_exceptions') def content_stream(self): if not self.http_headers: @@ -37,9 +38,9 @@ def content_stream(self): encoding = None if self.http_headers.get_header('transfer-encoding') == 'chunked': - return ChunkedDataReader(self.raw_stream, decomp_type=encoding) + return ChunkedDataReader(self.raw_stream, 
decomp_type=encoding, raise_exceptions=self.raise_exceptions) elif encoding: - return BufferedReader(self.raw_stream, decomp_type=encoding) + return BufferedReader(self.raw_stream, decomp_type=encoding, raise_exceptions=self.raise_exceptions) else: return self.raw_stream @@ -58,7 +59,7 @@ class ArcWarcRecordLoader(object): NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:') HTTP_SCHEMES = ('http:', 'https:') - def __init__(self, verify_http=True, arc2warc=True, fixup_bugs=True): + def __init__(self, verify_http=True, arc2warc=True, fixup_bugs=True, raise_exceptions=False): if arc2warc: self.arc_parser = ARC2WARCHeadersParser() else: @@ -69,6 +70,7 @@ def __init__(self, verify_http=True, arc2warc=True, fixup_bugs=True): self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http) self.fixup_bugs = fixup_bugs + self.raise_exceptions = raise_exceptions def parse_record_stream(self, stream, statusline=None, @@ -150,7 +152,7 @@ def parse_record_stream(self, stream, return ArcWarcRecord(the_format, rec_type, rec_headers, stream, http_headers, - content_type, length, digest_checker=digest_checker) + content_type, length, digest_checker=digest_checker, raise_exceptions=self.raise_exceptions) def wrap_digest_verifying_stream(self, stream, rec_type, rec_headers, digest_checker, length=None): payload_digest = rec_headers.get_header('WARC-Payload-Digest') diff --git a/warcio/tester.py b/warcio/tester.py index 68f108b2..84167c4c 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -8,6 +8,7 @@ from warcio.archiveiterator import WARCIterator from warcio.utils import to_native_str, Digester from warcio.exceptions import ArchiveLoadFailed +from warcio.bufferedreaders import ChunkedDataException, DecompressionException class Commentary(object): @@ -838,7 +839,7 @@ def _process_one(warcfile, all_records, concurrent_to, verbose): if warcfile.endswith('.arc') or warcfile.endswith('.arc.gz'): return with open(warcfile, 'rb') as stream: - for record in WARCIterator(stream, 
check_digests=True, fixup_bugs=False): + for record in WARCIterator(stream, check_digests=True, fixup_bugs=False, raise_exceptions=True): record = WrapRecord(record) digest_present = (record.rec_headers.get_header('WARC-Payload-Digest') or @@ -847,7 +848,14 @@ def _process_one(warcfile, all_records, concurrent_to, verbose): commentary = validate_record(record) save_global_info(record, warcfile, commentary, all_records, concurrent_to) - record.stream_for_digest_check() + try: + record.stream_for_digest_check() + except ChunkedDataException: + commentary.error('Transfer-Encoding: chunked, saw an error attempting to unchunk') + pass + except DecompressionException as e: + commentary.error('Content-Encoding indicates compression, saw an error attempting to decompress: '+str(e)) + pass if verbose or commentary.has_comments() or record.digest_checker.passed is False: print(' ', 'WARC-Record-ID', commentary.record_id()) From 08e6bd9c88ab743c1794d15b7ec79711d6db6808 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Fri, 1 Feb 2019 22:13:07 -0800 Subject: [PATCH 62/68] make this function reentrant --- warcio/recordloader.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/warcio/recordloader.py b/warcio/recordloader.py index 3903f4b1..d5523f75 100644 --- a/warcio/recordloader.py +++ b/warcio/recordloader.py @@ -24,11 +24,15 @@ def __init__(self, *args, **kwargs): self.payload_length = -1 self.digest_checker = kwargs.get('digest_checker') self.raise_exceptions = kwargs.get('raise_exceptions') + self._content_stream = None def content_stream(self): if not self.http_headers: return self.raw_stream + if self._content_stream: + return self._content_stream + encoding = self.http_headers.get_header('content-encoding') if encoding: @@ -38,11 +42,13 @@ def content_stream(self): encoding = None if self.http_headers.get_header('transfer-encoding') == 'chunked': - return ChunkedDataReader(self.raw_stream, decomp_type=encoding, 
raise_exceptions=self.raise_exceptions) + self._content_stream = ChunkedDataReader(self.raw_stream, decomp_type=encoding, raise_exceptions=self.raise_exceptions) elif encoding: - return BufferedReader(self.raw_stream, decomp_type=encoding, raise_exceptions=self.raise_exceptions) + self._content_stream = BufferedReader(self.raw_stream, decomp_type=encoding, raise_exceptions=self.raise_exceptions) else: - return self.raw_stream + self._content_stream = self.raw_stream + + return self._content_stream #================================================================= From d1f48ed5dc108b6038e9e54cc73788556e75ab2e Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Fri, 1 Feb 2019 22:13:26 -0800 Subject: [PATCH 63/68] narrow exception; fix bug not reading to the end of a chunked buffer --- warcio/bufferedreaders.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/warcio/bufferedreaders.py b/warcio/bufferedreaders.py index 734ce23a..74adae51 100644 --- a/warcio/bufferedreaders.py +++ b/warcio/bufferedreaders.py @@ -38,9 +38,8 @@ def brotli_decompressor(): #================================================================= class DecompressionException(Exception): - def __init__(self, msg, data=b''): + def __init__(self, msg): Exception.__init__(self, msg) - self.data = data #================================================================= @@ -144,7 +143,7 @@ def _decompress(self, data): if self.decompressor and data: try: data = self.decompressor.decompress(data) - except Exception as e: + except zlib.error as e: # if first read attempt, assume non-gzipped stream if self.num_block_read == 0: if self.decomp_type == 'deflate': From 6e44a44af2278e09cc35001829a7b9acb34779c9 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Sat, 2 Feb 2019 09:30:51 -0800 Subject: [PATCH 64/68] ... 
--- warcio/tester.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/warcio/tester.py b/warcio/tester.py index 84167c4c..84ea75c3 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -840,6 +840,7 @@ def _process_one(warcfile, all_records, concurrent_to, verbose): return with open(warcfile, 'rb') as stream: for record in WARCIterator(stream, check_digests=True, fixup_bugs=False, raise_exceptions=True): + #for record in WARCIterator(stream, check_digests=True, fixup_bugs=False): record = WrapRecord(record) digest_present = (record.rec_headers.get_header('WARC-Payload-Digest') or @@ -850,11 +851,11 @@ def _process_one(warcfile, all_records, concurrent_to, verbose): try: record.stream_for_digest_check() - except ChunkedDataException: - commentary.error('Transfer-Encoding: chunked, saw an error attempting to unchunk') + except ChunkedDataException as e: + commentary.comment('Transfer-Encoding: chunked, saw exception: '+str(e)) pass except DecompressionException as e: - commentary.error('Content-Encoding indicates compression, saw an error attempting to decompress: '+str(e)) + commentary.comment('Content-Encoding indicates compression, saw: '+str(e)) pass if verbose or commentary.has_comments() or record.digest_checker.passed is False: From 59198eb4f77292565fe65613fa662cce449f4db6 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Wed, 6 Feb 2019 11:53:02 -0800 Subject: [PATCH 65/68] put tester output in external files --- test/data/example-digest-bad.warc.test | 22 ++ test/data/example.warc.test | 16 + .../standard-torture-validate-field.warc.test | 80 ++++ ...standard-torture-validate-record.warc.test | 112 ++++++ test/test_tester.py | 96 +++++ test/test_tests.py | 348 ------------------ 6 files changed, 326 insertions(+), 348 deletions(-) create mode 100644 test/data/example-digest-bad.warc.test create mode 100644 test/data/example.warc.test create mode 100644 test/data/standard-torture-validate-field.warc.test create mode 100644 
test/data/standard-torture-validate-record.warc.test create mode 100644 test/test_tester.py delete mode 100644 test/test_tests.py diff --git a/test/data/example-digest-bad.warc.test b/test/data/example-digest-bad.warc.test new file mode 100644 index 00000000..15a5efaf --- /dev/null +++ b/test/data/example-digest-bad.warc.test @@ -0,0 +1,22 @@ +test/data/example-digest-bad.warc + WARC-Record-ID + WARC-Type request + payload digest failed: sha1:1112H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ + error: WARC-IP-Address should be used for http and https requests + WARC-Record-ID + WARC-Type request + digest pass + error: WARC-IP-Address should be used for http and https requests + error: Duplicate WARC-Record-ID: + WARC-Record-ID + WARC-Type request + digest pass + error: WARC-IP-Address should be used for http and https requests + error: Duplicate WARC-Record-ID: + WARC-Record-ID + WARC-Type request + digest pass + error: WARC-IP-Address should be used for http and https requests + error: Duplicate WARC-Record-ID: +global Concurrent-To checks + comment: WARC-Concurrent-To not found: WARC-Concurrent-To diff --git a/test/data/example.warc.test b/test/data/example.warc.test new file mode 100644 index 00000000..52b3c79f --- /dev/null +++ b/test/data/example.warc.test @@ -0,0 +1,16 @@ +test/data/example.warc + WARC-Record-ID + WARC-Type request + digest not present + error: WARC-IP-Address should be used for http and https requests + WARC-Record-ID + WARC-Type revisit + digest present but not checked (revisit) + recommendation: Missing recommended header: WARC-Refers-To + comment: This Heretrix extension never made it into the standard: WARC-Profile http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest + comment: Field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com/ + comment: Field was introduced after this warc version: 1.0 WARC-Refers-To-Date 2017-03-06T04:02:06Z + WARC-Record-ID + WARC-Type request + digest not present + 
error: WARC-IP-Address should be used for http and https requests diff --git a/test/data/standard-torture-validate-field.warc.test b/test/data/standard-torture-validate-field.warc.test new file mode 100644 index 00000000..de2e3fe1 --- /dev/null +++ b/test/data/standard-torture-validate-field.warc.test @@ -0,0 +1,80 @@ +test/data/standard-torture-validate-field.warc + WARC-Record-ID + WARC-Type does-not-exist + unknown hash algorithm name in block digest + error: uri must not be within <>: WARC-Target-URI + error: Duplicate field seen: WARC-Target-URI example.com + error: Invalid uri, no scheme: WARC-Target-URI example.com + error: Duplicate field seen: WARC-Target-URI ex ample.com + error: Invalid uri, no scheme: WARC-Target-URI ex ample.com + error: Invalid uri, contains whitespace: WARC-Target-URI ex ample.com + error: Duplicate field seen: WARC-Target-URI h<>ttp://example.com/ + error: Invalid uri scheme, bad character: WARC-Target-URI h<>ttp://example.com/ + error: Duplicate field seen: WARC-Type CAPITALIZED + error: uri must be within <>: WARC-Concurrent-To http://example.com/ + error: Duplicate field seen: WARC-Date 2017-03-06T04:03:53.Z + error: Invalid timestamp: WARC-Date 2017-03-06T04:03:53.Z + error: WARC versions <= 1.0 may not have timestamps with fractional seconds: WARC-Date 2017-03-06T04:03:53.Z + error: Must contain a /: Content-Type asdf + error: Invalid subtype: Content-Type asdf + error: Duplicate field seen: Content-Type has space/asdf + error: Invalid type: Content-Type has space/asdf + error: Duplicate field seen: Content-Type asdf/has space + error: Invalid subtype: Content-Type asdf/has space + error: Duplicate field seen: Content-Type asdf/has space;asdf + error: Invalid subtype: Content-Type asdf/has space;asdf + error: Missing algorithm: WARC-Block-Digest asdf + error: Duplicate field seen: WARC-Block-Digest has space:asdf + error: Invalid algorithm: WARC-Block-Digest has space:asdf + error: Duplicate field seen: WARC-Block-Digest 
sha1:&$*^&*^#*&^ + error: Invalid ip: WARC-IP-Address 1.2.3.4.5 + error: uri must be within <>: WARC-Warcinfo-ID asdf:asdf + error: Duplicate field seen: WARC-Profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest + error: Must contain a /: WARC-Identified-Payload-Type asdf + error: Invalid subtype: WARC-Identified-Payload-Type asdf + error: uri must be within <>: WARC-Segment-Origin-ID http://example.com + error: Must be an integer: WARC-Segment-Number not-an-integer + error: Duplicate field seen: WARC-Segment-Number 0 + error: Must be 1 or greater: WARC-Segment-Number 0 + error: Non-continuation records must always have WARC-Segment-Number: 1: WARC-Segment-Number 0 + error: Duplicate field seen: WARC-Segment-Number 1 + error: Duplicate field seen: WARC-Segment-Number 2 + error: Non-continuation records must always have WARC-Segment-Number: 1: WARC-Segment-Number 2 + error: Duplicate field seen: WARC-Segment-Total-Length not-an-integer + error: Must be an integer: WARC-Segment-Total-Length not-an-integer + error: Invalid timestamp: WARC-Refers-To-Date not-a-date + comment: Unknown WARC-Type: WARC-Type does-not-exist + comment: WARC-Type is not lower-case: WARC-Type CAPITALIZED + comment: Unknown WARC-Type: WARC-Type CAPITALIZED + comment: Unknown digest algorithm: WARC-Block-Digest asdf + comment: Invalid-looking digest value: WARC-Block-Digest sha1:&$*^&*^#*&^ + comment: Unknown value, perhaps an extension: WARC-Truncated invalid + comment: Unknown value, perhaps an extension: WARC-Profile asdf + comment: Field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com + comment: Field was introduced after this warc version: 1.0 WARC-Refers-To-Date not-a-date + comment: This Heretrix extension never made it into the standard: WARC-Refers-To-Filename asdf + comment: This Heretrix extension never made it into the standard: WARC-Refers-To-File-Offset 1234 + comment: Unknown field, no validation performed: 
WARC-Unknown-Field asdf + WARC-Record-ID None + WARC-Type invalid + digest not present + error: Duplicate field seen: WARC-Date 2017-03-06T04:03:53.Z + error: Invalid timestamp: WARC-Date 2017-03-06T04:03:53.Z + error: Duplicate field seen: WARC-Date 2017-03-06T04:03:53.0Z + comment: Unknown WARC-Type: WARC-Type invalid + WARC-Record-ID None + WARC-Type request + digest not present + error: Segmented records must have both WARC-Segment-Number and WARC-Segment-Origin-ID + error: Missing required header: Content-Type + error: Missing required header: WARC-Date + error: Missing required header: WARC-Record-ID + error: Missing required header: WARC-Target-URI + recommendation: Do not segment WARC-Type request + saw exception ArchiveLoadFailed: Invalid WARC record, first line: WARC/invalid + skipping rest of file +global warcinfo checks + comment: WARC-Warcinfo-ID not found: WARC-Warcinfo-ID asdf:asdf +global Concurrent-To checks + comment: WARC-Concurrent-To not found: WARC-Concurrent-To + comment: WARC-Concurrent-To not found: WARC-Concurrent-To http://example.com/ diff --git a/test/data/standard-torture-validate-record.warc.test b/test/data/standard-torture-validate-record.warc.test new file mode 100644 index 00000000..e7b17345 --- /dev/null +++ b/test/data/standard-torture-validate-record.warc.test @@ -0,0 +1,112 @@ +test/data/standard-torture-validate-record.warc + WARC-Record-ID None + WARC-Type warcinfo + digest not present + error: uri must be within <>: WARC-Refers-To probhibited + error: Missing required header: WARC-Date + error: Missing required header: WARC-Record-ID + error: Field not allowed in record type: warcinfo WARC-Refers-To + error: warc-fields contains invalid utf-8: 'utf-8' codec can't decode byte 0xc3 in position 57: invalid continuation byte + comment: The first line of warc-fields cannot start with whitespace + comment: warc-fields lines must end with \r\n: test: lines should end with \r\n + comment: Missing colon in warc-fields line: no colon 
+ comment: Invalid warc-fields name: token cannot have a space + WARC-Record-ID + WARC-Type warcinfo + digest not present + error: Missing required header: WARC-Date + comment: warc-fields block present but empty + WARC-Record-ID + WARC-Type warcinfo + digest not present + error: Missing required header: WARC-Date + recommendation: warcinfo Content-Type recommended to be application/warc-fields: not-application/warc-fields + WARC-Record-ID + WARC-Type response + digest not present + error: Missing required header: WARC-Date + error: Responses for http/https should have Content-Type of application/http; msgtype=response or application/http: text/plain + error: WARC-IP-Address should be used for http and https responses + error: http/https responses should have http headers + WARC-Record-ID + WARC-Type resource + digest not present + error: Missing required header: WARC-Date + error: resource records for dns shall have Content-Type of text/dns: text/plain + WARC-Record-ID + WARC-Type resource + digest not present + error: Missing required header: WARC-Date + comment: Unknown field, no validation performed: WARC-Test-TODO add another with valid block + WARC-Record-ID + WARC-Type resource + digest not present + error: Missing required header: Content-Type + error: Missing required header: WARC-Date + WARC-Record-ID + WARC-Type request + digest not present + error: Missing required header: WARC-Date + error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http: text/plain + error: WARC-IP-Address should be used for http and https requests + WARC-Record-ID + WARC-Type request + digest not present + error: Missing required header: WARC-Date + error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http: text/plain + WARC-Record-ID + WARC-Type metadata + digest not present + error: Missing required header: WARC-Date + comment: warc-fields block present but empty + 
WARC-Record-ID + WARC-Type metadata + digest not present + error: Missing required header: WARC-Date + WARC-Record-ID + WARC-Type revisit + digest not present + error: Missing required header: Content-Type + error: Missing required header: WARC-Date + error: Missing required header: WARC-Target-URI + comment: Unknown value, perhaps an extension: WARC-Profile none + comment: No revisit details validation done due to unknown profile: none + WARC-Record-ID + WARC-Type revisit + digest not present + error: Missing required header: Content-Type + error: Missing required header: WARC-Date + error: Missing required header: WARC-Target-URI + error: Missing required header: WARC-Payload-Digest + recommendation: Missing recommended header: WARC-Refers-To + recommendation: Missing recommended header: WARC-Refers-To-Date + recommendation: Missing recommended header: WARC-Refers-To-Target-URI + comment: WARC-Profile value is for a different version: 1.0 http://netpreserve.org/warc/1.1/revisit/identical-payload-digest + WARC-Record-ID + WARC-Type revisit + digest not present + error: Missing required header: Content-Type + error: Missing required header: WARC-Date + error: Missing required header: WARC-Target-URI + recommendation: Missing recommended header: WARC-Refers-To + recommendation: Missing recommended header: WARC-Refers-To-Date + WARC-Record-ID + WARC-Type conversion + digest not present + error: Missing required header: WARC-Date + error: Missing required header: WARC-Target-URI + WARC-Record-ID + WARC-Type continuation + digest not present + error: Missing required header: WARC-Date + error: Missing required header: WARC-Segment-Origin-ID + error: Missing required header: WARC-Target-URI + error: continuation record must have WARC-Segment-Number > 1: 1 + comment: warcio test continuation code has not been tested, expect bugs + WARC-Record-ID + WARC-Type continuation + digest not present + error: Missing required header: WARC-Date + error: Missing required header: 
WARC-Segment-Origin-ID + error: Missing required header: WARC-Target-URI + comment: warcio test continuation code has not been tested, expect bugs diff --git a/test/test_tester.py b/test/test_tester.py new file mode 100644 index 00000000..49b1cc6d --- /dev/null +++ b/test/test_tester.py @@ -0,0 +1,96 @@ +from warcio.cli import main +from warcio.utils import to_native_str +import warcio.tester + +from . import get_test_file +from .test_cli import patch_stdout + + +file_map = {} + + +def map_test_file(filename): + file_map[filename] = get_test_file(filename) + return file_map[filename] + + +def helper(args, expected_exit_value): + with patch_stdout() as buff: + exit_value = None + try: + main(args=args) + except SystemExit as e: + exit_value = e.code + finally: + assert exit_value == expected_exit_value + + return to_native_str(buff.getvalue()) + + +def remove_before_test_data(s): + ret = '' + for line in s.splitlines(True): + for filename, value in file_map.items(): + if value in line: + line = line.replace(value, 'test/data/' + filename) + ret += line + return ret + + +def run_one(f): + args = ['test'] + args.append(f) + + with open(f+'.test', 'r') as expectedf: + expected = expectedf.read() + + value = helper(args, 0) + print(remove_before_test_data(value)) + + actual = remove_before_test_data(value) + + assert actual == expected + + +def test_torture(): + files = ['standard-torture-validate-record.warc', + 'standard-torture-validate-field.warc'] + [run_one(map_test_file(filename)) for filename in files] + + +def test_arc(): + files = ['does-not-exist.arc'] + files = [map_test_file(filename) for filename in files] + + args = ['test'] + args.extend(files) + + expected = """\ +test/data/does-not-exist.arc +""" + + value = helper(args, 0) + assert remove_before_test_data(value) == expected + + +def test_digests(): + # needed for test coverage + files = ['example-digest-bad.warc', 'example.warc'] + [run_one(map_test_file(filename)) for filename in files] + + +def 
test_leftovers(): + commentary = warcio.tester.Commentary('id', 'type') + assert not commentary.has_comments() + + # hard to test because invalid WARC Content-Length raises in archiveiterator + warcio.tester.validate_content_length('Content-Length', 'not-an-integer', None, '1.0', commentary, None) + + # hard to test because warcio raises for unknown WARC version + warcio.tester.validate_profile('blah', 'blah', None, '999', commentary, None) + + expected = '''\ +error: Must be an integer: Content-Length not-an-integer +''' + + assert '\n'.join(commentary.comments())+'\n' == expected diff --git a/test/test_tests.py b/test/test_tests.py deleted file mode 100644 index 200df8ae..00000000 --- a/test/test_tests.py +++ /dev/null @@ -1,348 +0,0 @@ -from warcio.cli import main -from warcio.utils import to_native_str -import warcio.tester - -from . import get_test_file -from .test_cli import patch_stdout - - -file_map = {} - - -def map_test_file(filename): - file_map[filename] = get_test_file(filename) - return file_map[filename] - - -def helper(args, expected_exit_value): - with patch_stdout() as buff: - exit_value = None - try: - main(args=args) - except SystemExit as e: - exit_value = e.code - finally: - assert exit_value == expected_exit_value - - return to_native_str(buff.getvalue()) - - -def remove_before_test_data(s): - ret = '' - for line in s.splitlines(True): - for filename, value in file_map.items(): - if value in line: - line = line.replace(value, 'test/data/' + filename) - ret += line - return ret - - -def test_torture_validate_record(): - files = ['standard-torture-validate-record.warc'] - files = [map_test_file(filename) for filename in files] - - args = ['test'] - args.extend(files) - - expected = """\ -test/data/standard-torture-validate-record.warc - WARC-Record-ID None - WARC-Type warcinfo - digest not present - error: uri must be within <>: WARC-Refers-To probhibited - error: missing required header: WARC-Date - error: missing required header: 
WARC-Record-ID - error: field not allowed in record type: warcinfo WARC-Refers-To - error: warc-fields contains invalid utf-8: 'utf-8' codec can't decode byte 0xc3 in position 57: invalid continuation byte - comment: The first line of warc-fields cannot start with whitespace - comment: warc-fields lines must end with \\r\\n: test: lines should end with \\r\\n - comment: Missing colon in warc-fields line: no colon - comment: Invalid warc-fields name: token cannot have a space - WARC-Record-ID - WARC-Type warcinfo - digest not present - error: missing required header: WARC-Date - comment: warc-fields block present but empty - WARC-Record-ID - WARC-Type warcinfo - digest not present - error: missing required header: WARC-Date - recommendation: warcinfo Content-Type recommended to be application/warc-fields: not-application/warc-fields - WARC-Record-ID - WARC-Type response - digest not present - error: missing required header: WARC-Date - error: responses for http/https should have Content-Type of application/http; msgtype=response or application/http: text/plain - error: WARC-IP-Address should be used for http and https responses - error: http/https responses should have http headers - WARC-Record-ID - WARC-Type resource - digest not present - error: missing required header: WARC-Date - error: resource records for dns shall have Content-Type of text/dns: text/plain - WARC-Record-ID - WARC-Type resource - digest not present - error: missing required header: WARC-Date - comment: unknown field, no validation performed: WARC-Test-TODO add another with valid block - WARC-Record-ID - WARC-Type resource - digest not present - error: missing required header: Content-Type - error: missing required header: WARC-Date - WARC-Record-ID - WARC-Type request - digest not present - error: missing required header: WARC-Date - error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http: text/plain - error: WARC-IP-Address should be 
used for http and https requests - WARC-Record-ID - WARC-Type request - digest not present - error: missing required header: WARC-Date - error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http: text/plain - WARC-Record-ID - WARC-Type metadata - digest not present - error: missing required header: WARC-Date - comment: warc-fields block present but empty - WARC-Record-ID - WARC-Type metadata - digest not present - error: missing required header: WARC-Date - WARC-Record-ID - WARC-Type revisit - digest not present - error: missing required header: Content-Type - error: missing required header: WARC-Date - error: missing required header: WARC-Target-URI - comment: unknown value, perhaps an extension: WARC-Profile none - comment: no revisit details validation done due to unknown profile: none - WARC-Record-ID - WARC-Type revisit - digest not present - error: missing required header: Content-Type - error: missing required header: WARC-Date - error: missing required header: WARC-Target-URI - error: missing required header: WARC-Payload-Digest - recommendation: missing recommended header: WARC-Refers-To - recommendation: missing recommended header: WARC-Refers-To-Date - recommendation: missing recommended header: WARC-Refers-To-Target-URI - comment: WARC-Profile value is for a different version: 1.0 http://netpreserve.org/warc/1.1/revisit/identical-payload-digest - WARC-Record-ID - WARC-Type revisit - digest not present - error: missing required header: Content-Type - error: missing required header: WARC-Date - error: missing required header: WARC-Target-URI - recommendation: missing recommended header: WARC-Refers-To - recommendation: missing recommended header: WARC-Refers-To-Date - WARC-Record-ID - WARC-Type conversion - digest not present - error: missing required header: WARC-Date - error: missing required header: WARC-Target-URI - WARC-Record-ID - WARC-Type continuation - digest not present - error: missing 
required header: WARC-Date - error: missing required header: WARC-Segment-Origin-ID - error: missing required header: WARC-Target-URI - error: continuation record must have WARC-Segment-Number > 1: 1 - comment: warcio test continuation code has not been tested, expect bugs - WARC-Record-ID - WARC-Type continuation - digest not present - error: missing required header: WARC-Date - error: missing required header: WARC-Segment-Origin-ID - error: missing required header: WARC-Target-URI - comment: warcio test continuation code has not been tested, expect bugs -""" - - value = helper(args, 0) - print(remove_before_test_data(value)) - - actual = remove_before_test_data(value) - - assert actual == expected - - -def test_torture_validate_field(): - files = ['standard-torture-validate-field.warc'] - files = [map_test_file(filename) for filename in files] - - args = ['test'] - args.extend(files) - - expected = """\ -test/data/standard-torture-validate-field.warc - WARC-Record-ID - WARC-Type does-not-exist - unknown hash algorithm name in block digest - error: uri must not be within <>: WARC-Target-URI - error: invalid uri scheme, bad character: WARC-Target-URI - error: duplicate field seen: WARC-Target-URI example.com - error: invalid uri, no scheme: WARC-Target-URI example.com - error: duplicate field seen: WARC-Target-URI ex ample.com - error: invalid uri, no scheme: WARC-Target-URI ex ample.com - error: invalid uri, contains whitespace: WARC-Target-URI ex ample.com - error: invalid uri scheme, bad character: WARC-Target-URI ex ample.com - error: duplicate field seen: WARC-Target-URI h<>ttp://example.com/ - error: invalid uri scheme, bad character: WARC-Target-URI h<>ttp://example.com/ - error: duplicate field seen: WARC-Type CAPITALIZED - error: uri must be within <>: WARC-Concurrent-To http://example.com/ - error: duplicate field seen: WARC-Date 2017-03-06T04:03:53.Z - error: Invalid timestamp: WARC-Date 2017-03-06T04:03:53.Z - error: WARC versions <= 1.0 may not have 
timestamps with fractional seconds: WARC-Date 2017-03-06T04:03:53.Z - error: must contain a /: Content-Type asdf - error: invalid subtype: Content-Type asdf - error: duplicate field seen: Content-Type has space/asdf - error: invalid type: Content-Type has space/asdf - error: duplicate field seen: Content-Type asdf/has space - error: invalid subtype: Content-Type asdf/has space - error: duplicate field seen: Content-Type asdf/has space;asdf - error: invalid subtype: Content-Type asdf/has space;asdf - error: missing algorithm: WARC-Block-Digest asdf - error: duplicate field seen: WARC-Block-Digest has space:asdf - error: invalid algorithm: WARC-Block-Digest has space:asdf - error: duplicate field seen: WARC-Block-Digest sha1:&$*^&*^#*&^ - error: invalid ip: WARC-IP-Address 1.2.3.4.5 - error: uri must be within <>: WARC-Warcinfo-ID asdf:asdf - error: duplicate field seen: WARC-Profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest - error: must contain a /: WARC-Identified-Payload-Type asdf - error: invalid subtype: WARC-Identified-Payload-Type asdf - error: uri must be within <>: WARC-Segment-Origin-ID http://example.com - error: must be an integer: WARC-Segment-Number not-an-integer - error: duplicate field seen: WARC-Segment-Number 0 - error: must be 1 or greater: WARC-Segment-Number 0 - error: non-continuation records must always have WARC-Segment-Number: 1: WARC-Segment-Number 0 - error: duplicate field seen: WARC-Segment-Number 1 - error: duplicate field seen: WARC-Segment-Number 2 - error: non-continuation records must always have WARC-Segment-Number: 1: WARC-Segment-Number 2 - error: duplicate field seen: WARC-Segment-Total-Length not-an-integer - error: must be an integer: WARC-Segment-Total-Length not-an-integer - error: Invalid timestamp: WARC-Refers-To-Date not-a-date - comment: unknown WARC-Type: WARC-Type does-not-exist - comment: WARC-Type is not lower-case: WARC-Type CAPITALIZED - comment: unknown WARC-Type: WARC-Type CAPITALIZED - 
comment: unknown digest algorithm: WARC-Block-Digest asdf - comment: Invalid-looking digest value: WARC-Block-Digest sha1:&$*^&*^#*&^ - comment: unknown value, perhaps an extension: WARC-Truncated invalid - comment: unknown value, perhaps an extension: WARC-Profile asdf - comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com - comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Date not-a-date - comment: This Heretrix extension never made it into the standard: WARC-Refers-To-Filename asdf - comment: This Heretrix extension never made it into the standard: WARC-Refers-To-File-Offset 1234 - comment: unknown field, no validation performed: WARC-Unknown-Field asdf - WARC-Record-ID None - WARC-Type invalid - digest not present - error: duplicate field seen: WARC-Date 2017-03-06T04:03:53.Z - error: Invalid timestamp: WARC-Date 2017-03-06T04:03:53.Z - error: duplicate field seen: WARC-Date 2017-03-06T04:03:53.0Z - comment: unknown WARC-Type: WARC-Type invalid - WARC-Record-ID None - WARC-Type request - digest not present - error: segmented records must have both WARC-Segment-Number and WARC-Segment-Origin-ID - error: missing required header: Content-Type - error: missing required header: WARC-Date - error: missing required header: WARC-Record-ID - error: missing required header: WARC-Target-URI - recommendation: do not segment WARC-Type request -global warcinfo checks - comment: WARC-Warcinfo-ID not found: WARC-Warcinfo-ID asdf:asdf -global Concurrent-To checks - comment: WARC-Concurrent-To not found: WARC-Concurrent-To - comment: WARC-Concurrent-To not found: WARC-Concurrent-To http://example.com/ -""" - - value = helper(args, 0) - actual = remove_before_test_data(value) - - print(actual) - assert actual == expected - - -def test_arc(): - files = ['does-not-exist.arc'] - files = [map_test_file(filename) for filename in files] - - args = ['test'] - args.extend(files) - - expected = """\ 
-test/data/does-not-exist.arc -""" - - value = helper(args, 0) - assert remove_before_test_data(value) == expected - - -def test_digests(): - # needed for test coverage - files = ['example-digest-bad.warc', 'example.warc'] - files = [map_test_file(filename) for filename in files] - - args = ['test'] - args.extend(files) - - expected = """\ -test/data/example-digest-bad.warc - WARC-Record-ID - WARC-Type request - payload digest failed: sha1:1112H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - error: WARC-IP-Address should be used for http and https requests - WARC-Record-ID - WARC-Type request - digest pass - error: WARC-IP-Address should be used for http and https requests - error: Duplicate WARC-Record-ID: - WARC-Record-ID - WARC-Type request - digest pass - error: WARC-IP-Address should be used for http and https requests - error: Duplicate WARC-Record-ID: - WARC-Record-ID - WARC-Type request - digest pass - error: WARC-IP-Address should be used for http and https requests - error: Duplicate WARC-Record-ID: -test/data/example.warc - WARC-Record-ID - WARC-Type request - digest not present - error: WARC-IP-Address should be used for http and https requests - error: Duplicate WARC-Record-ID: found in files test/data/example.warc test/data/example-digest-bad.warc - WARC-Record-ID - WARC-Type revisit - digest present but not checked (revisit) - recommendation: Missing recommended header: WARC-Refers-To - comment: This Heretrix extension never made it into the standard: WARC-Profile http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest - comment: Field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com/ - comment: Field was introduced after this warc version: 1.0 WARC-Refers-To-Date 2017-03-06T04:02:06Z - WARC-Record-ID - WARC-Type request - digest not present - error: WARC-IP-Address should be used for http and https requests -""" - - value = helper(args, 0) - assert remove_before_test_data(value) == expected - - -def 
test_leftovers(): - commentary = warcio.tester.Commentary('id', 'type') - assert not commentary.has_comments() - - # hard to test because invalid WARC Content-Length raises in archiveiterator - warcio.tester.validate_content_length('Content-Length', 'not-an-integer', None, '1.0', commentary, None) - - # hard to test because warcio raises for unknown WARC version - warcio.tester.validate_profile('blah', 'blah', None, '999', commentary, None) - - expected = '''\ -error: Must be an integer: Content-Length not-an-integer -''' - - assert '\n'.join(commentary.comments())+'\n' == expected From b61878e28058ba3b1aa5a6363717f781dd4b0995 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Thu, 4 Apr 2019 15:01:31 -0700 Subject: [PATCH 66/68] wip --- test/test_tester.py | 2 +- warcio/archiveiterator.py | 5 ++- warcio/bufferedreaders.py | 49 +++++++++++++------------ warcio/recordloader.py | 43 ++++++++++++++++++---- warcio/tester.py | 76 +++++++++------------------------------ 5 files changed, 82 insertions(+), 93 deletions(-) diff --git a/test/test_tester.py b/test/test_tester.py index 49b1cc6d..08963ea9 100644 --- a/test/test_tester.py +++ b/test/test_tester.py @@ -80,7 +80,7 @@ def test_digests(): def test_leftovers(): - commentary = warcio.tester.Commentary('id', 'type') + commentary = warcio.recordloader.Commentary() assert not commentary.has_comments() # hard to test because invalid WARC Content-Length raises in archiveiterator diff --git a/warcio/archiveiterator.py b/warcio/archiveiterator.py index 5e9c02ca..24094936 100644 --- a/warcio/archiveiterator.py +++ b/warcio/archiveiterator.py @@ -56,14 +56,13 @@ class ArchiveIterator(six.Iterator): def __init__(self, fileobj, no_record_parse=False, verify_http=False, arc2warc=False, ensure_http_headers=False, block_size=BUFF_SIZE, - check_digests=False, fixup_bugs=True, raise_exceptions=False): + check_digests=False, fixup_bugs=True): self.fh = fileobj self.loader = ArcWarcRecordLoader(verify_http=verify_http, 
arc2warc=arc2warc, - fixup_bugs=fixup_bugs, - raise_exceptions=raise_exceptions) + fixup_bugs=fixup_bugs) self.known_format = None self.mixed_arc_warc = arc2warc diff --git a/warcio/bufferedreaders.py b/warcio/bufferedreaders.py index 74adae51..f60ae1a5 100644 --- a/warcio/bufferedreaders.py +++ b/warcio/bufferedreaders.py @@ -36,12 +36,6 @@ def brotli_decompressor(): pass -#================================================================= -class DecompressionException(Exception): - def __init__(self, msg): - Exception.__init__(self, msg) - - #================================================================= class BufferedReader(object): """ @@ -71,7 +65,7 @@ def __init__(self, stream, block_size=BUFF_SIZE, decomp_type=None, starting_data=None, read_all_members=False, - raise_exceptions=False): + commentary=None): self.stream = stream self.block_size = block_size @@ -84,7 +78,7 @@ def __init__(self, stream, block_size=BUFF_SIZE, self.buff_size = 0 self.read_all_members = read_all_members - self.raise_exceptions = raise_exceptions + self.commentary = commentary def set_decomp(self, decomp_type): self._init_decomp(decomp_type) @@ -96,6 +90,10 @@ def _init_decomp(self, decomp_type): self.decomp_type = decomp_type self.decompressor = self.DECOMPRESSORS[decomp_type.lower()]() except KeyError: + # XXX don't raise? 
+ # we don't know if the enduser cares or not + # or the record might actually be uncompressed + # XXX what does pywb do raise Exception('Decompression type not supported: ' + decomp_type) else: @@ -150,8 +148,8 @@ def _decompress(self, data): self._init_decomp('deflate_alt') data = self._decompress(data) else: - if self.raise_exceptions: - raise DecompressionException(str(e)) + if self.commentary: + self.commentary.comment('Payload claimed to be compressed but apparently is not') self.decompressor = None # otherwise (partly decompressed), something is wrong else: @@ -290,40 +288,43 @@ class ChunkedDataReader(BufferedReader): If at any point the chunked header is not available, the stream is assumed to not be chunked and no more dechunking occurs. """ - def __init__(self, stream, **kwargs): + def __init__(self, stream, raise_exceptions=False, commentary=None, **kwargs): super(ChunkedDataReader, self).__init__(stream, **kwargs) self.all_chunks_read = False - self.not_chunked = False - - # if False, we'll use best-guess fallback for parse errors - self.raise_chunked_data_exceptions = kwargs.get('raise_exceptions') + self.not_actually_chunked = False + self.at_start = True + self.raise_chunked_data_exceptions = raise_exceptions + self.commentary = commentary def _fillbuff(self, block_size=None): - if self.not_chunked: + if self.not_actually_chunked: return super(ChunkedDataReader, self)._fillbuff(block_size) # Loop over chunks until there is some data (not empty()) # In particular, gzipped data may require multiple chunks to # return any decompressed result - while (self.empty() and - not self.all_chunks_read and - not self.not_chunked): - + while (self.empty() and not self.all_chunks_read): try: length_header = self.stream.readline(64) self._try_decode(length_header) + self.at_start = False except ChunkedDataException as e: if self.raise_chunked_data_exceptions: raise - # Can't parse the data as chunked. 
# It's possible that non-chunked data is served # with a Transfer-Encoding: chunked. # Treat this as non-chunk encoded from here on. + if self.commentary: + if self.at_start: + self.commentary.comment('Buffer claimed to be chunked, but was not from the start') + else: + self.commentary.comment('Buffer is chunked but there was an unchunking error midway') self._process_read(length_header + e.data) - self.not_chunked = True + self.not_actually_chunked = True + self.at_start = False - # parse as block as non-chunked + # parse as non-chunked return super(ChunkedDataReader, self)._fillbuff(block_size) def _try_decode(self, length_header): @@ -365,6 +366,8 @@ def _try_decode(self, length_header): msg = 'Ran out of data before end of chunk' raise ChunkedDataException(msg, data) else: + if self.commentary: + self.commentary.comment('Chunked reader ran out of data before end of chunk') chunk_size = data_len self.all_chunks_read = True diff --git a/warcio/recordloader.py b/warcio/recordloader.py index d5523f75..729cc3c6 100644 --- a/warcio/recordloader.py +++ b/warcio/recordloader.py @@ -16,6 +16,36 @@ logger = logging.getLogger(__name__) +#================================================================= +class Commentary(object): + def __init__(self): + self.errors = [] + self.recommendations = [] + self._comments = [] + + def error(self, *args): + self.errors.append(args) + + def recommendation(self, *args): + self.recommendations.append(args) + + def comment(self, *args): + self._comments.append(args) + + def has_comments(self): + if self.errors or self.recommendations or self._comments: + return True + + def comments(self): + # XXX str() all of these, in case an int or other thing slips in? 
+ for e in self.errors: + yield 'error: ' + ' '.join(e) + for r in self.recommendations: + yield 'recommendation: ' + ' '.join(r) + for c in self._comments: + yield 'comment: ' + ' '.join(c) + + #================================================================= class ArcWarcRecord(object): def __init__(self, *args, **kwargs): @@ -23,7 +53,7 @@ def __init__(self, *args, **kwargs): self.http_headers, self.content_type, self.length) = args self.payload_length = -1 self.digest_checker = kwargs.get('digest_checker') - self.raise_exceptions = kwargs.get('raise_exceptions') + self.commentary = kwargs.get('commentary') self._content_stream = None def content_stream(self): @@ -42,9 +72,9 @@ def content_stream(self): encoding = None if self.http_headers.get_header('transfer-encoding') == 'chunked': - self._content_stream = ChunkedDataReader(self.raw_stream, decomp_type=encoding, raise_exceptions=self.raise_exceptions) + self._content_stream = ChunkedDataReader(self.raw_stream, decomp_type=encoding, commentary=self.commentary) elif encoding: - self._content_stream = BufferedReader(self.raw_stream, decomp_type=encoding, raise_exceptions=self.raise_exceptions) + self._content_stream = BufferedReader(self.raw_stream, decomp_type=encoding, commentary=self.commentary) else: self._content_stream = self.raw_stream @@ -65,7 +95,7 @@ class ArcWarcRecordLoader(object): NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:') HTTP_SCHEMES = ('http:', 'https:') - def __init__(self, verify_http=True, arc2warc=True, fixup_bugs=True, raise_exceptions=False): + def __init__(self, verify_http=True, arc2warc=True, fixup_bugs=True): if arc2warc: self.arc_parser = ARC2WARCHeadersParser() else: @@ -76,7 +106,6 @@ def __init__(self, verify_http=True, arc2warc=True, fixup_bugs=True, raise_excep self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http) self.fixup_bugs = fixup_bugs - self.raise_exceptions = raise_exceptions def parse_record_stream(self, stream, statusline=None, @@ -134,6 
+163,7 @@ def parse_record_stream(self, stream, is_verifying = False digest_checker = DigestChecker(check_digests) + commentary = Commentary() # limit stream to the length for all valid records if length is not None and length >= 0: @@ -158,7 +188,8 @@ def parse_record_stream(self, stream, return ArcWarcRecord(the_format, rec_type, rec_headers, stream, http_headers, - content_type, length, digest_checker=digest_checker, raise_exceptions=self.raise_exceptions) + content_type, length, digest_checker=digest_checker, + commentary=commentary) def wrap_digest_verifying_stream(self, stream, rec_type, rec_headers, digest_checker, length=None): payload_digest = rec_headers.get_header('WARC-Payload-Digest') diff --git a/warcio/tester.py b/warcio/tester.py index 84ea75c3..cee5344f 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -8,45 +8,8 @@ from warcio.archiveiterator import WARCIterator from warcio.utils import to_native_str, Digester from warcio.exceptions import ArchiveLoadFailed -from warcio.bufferedreaders import ChunkedDataException, DecompressionException - - -class Commentary(object): - def __init__(self, record_id=None, rec_type=None): - self._record_id = record_id - self._rec_type = rec_type - self.errors = [] - self.recommendations = [] - self._comments = [] - - def record_id(self): - return self._record_id - - def rec_type(self): - return self._rec_type - - def error(self, *args): - self.errors.append(args) - - def recommendation(self, *args): - self.recommendations.append(args) - - def comment(self, *args): - self._comments.append(args) - - def has_comments(self): - if self.errors or self.recommendations or self._comments: - return True - - def comments(self): - # XXX str() all of these, in case an int or other thing slips in? 
- for e in self.errors: - yield 'error: ' + ' '.join(e) - for r in self.recommendations: - yield 'recommendation: ' + ' '.join(r) - for c in self._comments: - yield 'comment: ' + ' '.join(c) - +from warcio.bufferedreaders import ChunkedDataException +from warcio.recordloader import Commentary class WrapRecord(object): def __init__(self, obj): @@ -662,9 +625,7 @@ def validate_record_against_rec_type(config, record, commentary, pending): def validate_record(record): version = record.rec_headers.protocol.split('/', 1)[1] # XXX not exported - record_id = record.rec_headers.get_header('WARC-Record-ID') - rec_type = record.rec_headers.get_header('WARC-Type') - commentary = Commentary(record_id=record_id, rec_type=rec_type) + commentary = record.commentary pending = None seen_fields = set() @@ -683,6 +644,7 @@ def validate_record(record): if 'validate' in config: config['validate'](field, value, record, version, commentary, pending) + rec_type = record.rec_headers.get_header('WARC-Type') if rec_type not in record_types: # we print a comment for this elsewhere pass @@ -839,37 +801,31 @@ def _process_one(warcfile, all_records, concurrent_to, verbose): if warcfile.endswith('.arc') or warcfile.endswith('.arc.gz'): return with open(warcfile, 'rb') as stream: - for record in WARCIterator(stream, check_digests=True, fixup_bugs=False, raise_exceptions=True): - #for record in WARCIterator(stream, check_digests=True, fixup_bugs=False): - + for record in WARCIterator(stream, check_digests=True, fixup_bugs=False): record = WrapRecord(record) digest_present = (record.rec_headers.get_header('WARC-Payload-Digest') or record.rec_headers.get_header('WARC-Block-Digest')) + record_id = record.rec_headers.get_header('WARC-Record-ID') + rec_type = record.rec_headers.get_header('WARC-Type') - commentary = validate_record(record) - save_global_info(record, warcfile, commentary, all_records, concurrent_to) + validate_record(record) + record.stream_for_digest_check() - try: - 
record.stream_for_digest_check() - except ChunkedDataException as e: - commentary.comment('Transfer-Encoding: chunked, saw exception: '+str(e)) - pass - except DecompressionException as e: - commentary.comment('Content-Encoding indicates compression, saw: '+str(e)) - pass + commentary = record.commentary + save_global_info(record, warcfile, commentary, all_records, concurrent_to) if verbose or commentary.has_comments() or record.digest_checker.passed is False: - print(' ', 'WARC-Record-ID', commentary.record_id()) - print(' ', 'WARC-Type', commentary.rec_type()) - + print(' ', 'WARC-Record-ID', record_id) + print(' ', 'WARC-Type', rec_type) if record.digest_checker.passed is True: print(' digest pass') elif record.digest_checker.passed is None: if digest_present: - if commentary.rec_type() == 'revisit': + if rec_type == 'revisit': print(' digest present but not checked (revisit)') else: # pragma: no cover - # WARC record missing Content-Length: header, which is verboten + # should not ever happen + # example reason: WARC record missing Content-Length: header, but that case raises print(' digest present but not checked') else: print(' digest not present') From 2d2b7d560f4bccf694faf1a5eb7b36c020cbfdd6 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Mon, 9 Sep 2019 11:03:30 -0700 Subject: [PATCH 67/68] tests pass --- test/test_check_digest_examples.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/test_check_digest_examples.py b/test/test_check_digest_examples.py index 679d7d24..89eb296f 100644 --- a/test/test_check_digest_examples.py +++ b/test/test_check_digest_examples.py @@ -9,7 +9,8 @@ 'example-iana.org-chunked.warc', 'example-wrong-chunks.warc.gz', 'example-bad-non-chunked.warc.gz', - 'example-digest.warc' + 'example-digest-bad.warc', + 'standard-torture-validate-field.warc', ] @@ -34,7 +35,7 @@ def check_helper(self, args, expected_exit_value, capsys): return capsys.readouterr()[0] # list for py33 support def 
test_check_invalid(self, capsys): - filenames = [get_test_file('example-digest.warc')] + filenames = [get_test_file('example-digest-bad.warc')] args = ['check'] + filenames value = self.check_helper(args, 1, capsys) From fc19c7d632e9440d88e6858db642686673506f20 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Sun, 16 Feb 2020 13:49:08 -0800 Subject: [PATCH 68/68] comments --- warcio/tester.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/warcio/tester.py b/warcio/tester.py index cee5344f..2fc8ff9b 100644 --- a/warcio/tester.py +++ b/warcio/tester.py @@ -93,7 +93,10 @@ def validate_warc_fields(record, commentary): commentary.comment('warc-fields block present but empty') return - # check known fields + # XXX check known fields + # warcinfo "but not limited to" + # metadata lacks that language + # https://github.com/iipc/warc-specifications/issues/7 def validate_warcinfo(record, commentary, pending): @@ -110,7 +113,7 @@ def validate_warcinfo(record, commentary, pending): # comment if http-header-from here and in the request? validate_warc_fields(record, commentary) - # whole-file tests: + # XXX whole-file tests: # recommended that all files start with warcinfo # elsewise allowable for warcinfo to appear anywhere @@ -152,6 +155,7 @@ def validate_response(record, commentary, pending): if int(http_content_length) != record.raw_stream.stream.limit: commentary.comment('Actual http payload length is different from http header Content-Length:', str(record.raw_stream.stream.limit), http_content_length) + # XXX can we say something useful if we are unable to check this length? why would it fail? def validate_resource(record, commentary, pending):