From ff1f543a078415ae042140b8a98c51d3e2d0a5f5 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Fri, 25 Jan 2019 16:05:38 -0800
Subject: [PATCH 01/68] warcio test

---
 ...le-digest.warc => example-digest-bad.warc} |   0
 test/test_archiveiterator.py                  |  10 +-
 warcio/archiveiterator.py                     |   5 +-
 warcio/cli.py                                 |  11 +
 warcio/recordloader.py                        |   9 +-
 warcio/tester.py                              | 638 ++++++++++++++++++
 6 files changed, 663 insertions(+), 10 deletions(-)
 rename test/data/{example-digest.warc => example-digest-bad.warc} (100%)
 create mode 100644 warcio/tester.py

diff --git a/test/data/example-digest.warc b/test/data/example-digest-bad.warc
similarity index 100%
rename from test/data/example-digest.warc
rename to test/data/example-digest-bad.warc
diff --git a/test/test_archiveiterator.py b/test/test_archiveiterator.py
index 8cba7600..810ba73b 100644
--- a/test/test_archiveiterator.py
+++ b/test/test_archiveiterator.py
@@ -185,6 +185,8 @@ def test_err_arc_iterator_on_warc(self):
     def test_corrects_wget_bug(self):
         with self._find_first_by_type('example-wget-bad-target-uri.warc.gz', 'response') as record:
             assert record.rec_headers.get('WARC-Target-URI') == 'http://example.com/'
+        with self._find_first_by_type('example-wget-bad-target-uri.warc.gz', 'response', fixup_bugs=False) as record:
+            assert record.rec_headers.get('WARC-Target-URI') == 'http://example.com/'
 
     def _digests_mutilate_helper(self, contents, expected_t, expected_f, capsys, full_read=False):
         with pytest.raises(ArchiveLoadFailed):
@@ -243,9 +245,9 @@ def test_digests_file(self):
         expected_t = ['request', 'request', 'request']
 
         # record 1: invalid payload digest
-        assert self._load_archive('example-digest.warc', check_digests=True) == expected_t
-        assert self._load_archive('example-digest.warc', check_digests=False) == expected_f
+        assert self._load_archive('example-digest-bad.warc', check_digests=True) == expected_t
+        assert self._load_archive('example-digest-bad.warc', check_digests=False) == expected_f
 
         # record 2: b64 digest; record 3: b64 filename safe digest
-        assert self._load_archive('example-digest.warc', offset=922, check_digests=True) == expected_t
-        assert self._load_archive('example-digest.warc', offset=922, check_digests=False) == expected_t
+        assert self._load_archive('example-digest-bad.warc', offset=922, check_digests=True) == expected_t
+        assert self._load_archive('example-digest-bad.warc', offset=922, check_digests=False) == expected_t
diff --git a/warcio/archiveiterator.py b/warcio/archiveiterator.py
index 8f6a1b55..0d1fe2dd 100644
--- a/warcio/archiveiterator.py
+++ b/warcio/archiveiterator.py
@@ -43,12 +43,13 @@ class ArchiveIterator(six.Iterator):
     def __init__(self, fileobj, no_record_parse=False,
                  verify_http=False, arc2warc=False,
                  ensure_http_headers=False, block_size=BUFF_SIZE,
-                 check_digests=False):
+                 check_digests=False, fixup_bugs=True):
 
         self.fh = fileobj
 
         self.loader = ArcWarcRecordLoader(verify_http=verify_http,
-                                          arc2warc=arc2warc)
+                                          arc2warc=arc2warc,
+                                          fixup_bugs=fixup_bugs)
         self.known_format = None
 
         self.mixed_arc_warc = arc2warc
diff --git a/warcio/cli.py b/warcio/cli.py
index 85c4a750..7e40cdad 100644
--- a/warcio/cli.py
+++ b/warcio/cli.py
@@ -8,6 +8,7 @@
 
 from warcio.indexer import Indexer
 from warcio.checker import Checker
+from warcio.tester import Tester
 from warcio.utils import BUFF_SIZE
 
 import tempfile
@@ -54,6 +55,10 @@ def main(args=None):
     check.add_argument('-v', '--verbose', action='store_true')
     check.set_defaults(func=checker)
 
+    test = subparsers.add_parser('test', help='WARC standards tester')
+    test.add_argument('inputs', nargs='+')
+    test.set_defaults(func=tester)
+
     cmd = parser.parse_args(args=args)
     cmd.func(cmd)
 
@@ -106,6 +111,12 @@ def checker(cmd):
     sys.exit(_checker.process_all())
 
 
+# ============================================================================
+def tester(cmd):
+    _tester = Tester(cmd)
+    sys.exit(_tester.process_all())
+
+
 # ============================================================================
 class Recompressor(object):
     def __call__(self, cmd):
diff --git a/warcio/recordloader.py b/warcio/recordloader.py
index 2467bde3..1f17d1f0 100644
--- a/warcio/recordloader.py
+++ b/warcio/recordloader.py
@@ -55,7 +55,7 @@ class ArcWarcRecordLoader(object):
     NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:')
     HTTP_SCHEMES = ('http:', 'https:')
 
-    def __init__(self, verify_http=True, arc2warc=True):
+    def __init__(self, verify_http=True, arc2warc=True, fixup_bugs=True):
         if arc2warc:
             self.arc_parser = ARC2WARCHeadersParser()
         else:
@@ -65,6 +65,7 @@ def __init__(self, verify_http=True, arc2warc=True):
         self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http)
 
         self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http)
+        self.fixup_bugs = fixup_bugs
 
     def parse_record_stream(self, stream,
                             statusline=None,
@@ -96,7 +97,7 @@ def parse_record_stream(self, stream,
 
         elif the_format in ('warc', 'arc2warc'):
             rec_type = rec_headers.get_header('WARC-Type')
-            uri = self._ensure_target_uri_format(rec_headers)
+            uri = self._ensure_target_uri_format(rec_headers, fixup_bugs=self.fixup_bugs)
             length = rec_headers.get_header('Content-Length')
             content_type = rec_headers.get_header('Content-Type')
             if the_format == 'warc':
@@ -235,7 +236,7 @@ def _detect_type_load_headers(self, stream,
                 msg = 'Unknown archive format, first line: '
             raise ArchiveLoadFailed(msg + str(se.statusline))
 
-    def _ensure_target_uri_format(self, rec_headers):
+    def _ensure_target_uri_format(self, rec_headers, fixup_bugs=True):
         """Checks the value for the WARC-Target-URI header field to see if it starts
         with '<' and ends with '>' (Wget 1.19 bug) and if '<' and '>' are present,
         corrects and updates the field returning the corrected value for the field
@@ -246,7 +247,7 @@ def _ensure_target_uri_format(self, rec_headers):
         :rtype: str | None
         """
         uri = rec_headers.get_header('WARC-Target-URI')
-        if uri is not None and uri.startswith('<') and uri.endswith('>'):
+        if fixup_bugs and uri is not None and uri.startswith('<') and uri.endswith('>'):
             uri = uri[1:-1]
             rec_headers.replace_header('WARC-Target-URI', uri)
         return uri
diff --git a/warcio/tester.py b/warcio/tester.py
new file mode 100644
index 00000000..800f797e
--- /dev/null
+++ b/warcio/tester.py
@@ -0,0 +1,638 @@
+from __future__ import print_function
+
+import re
+import ipaddress
+import sys
+import traceback
+
+from warcio.archiveiterator import WARCIterator
+
+
+class Commentary(object):  # collects findings for one record; new-style like the other classes here
+    def __init__(self, record_id, rec_type):
+        self._record_id = record_id
+        self._rec_type = rec_type
+        self.errors = []  # each entry is the tuple of args given to error()
+        self.recommendations = []
+        self._comments = []
+
+    def record_id(self):
+        return self._record_id
+
+    def rec_type(self):
+        return self._rec_type
+
+    def error(self, *args):
+        self.errors.append(args)
+
+    def recommendation(self, *args):
+        self.recommendations.append(args)
+
+    def comment(self, *args):
+        self._comments.append(args)
+
+    def has_comments(self):
+        # always a real bool; previously fell through to None when nothing was recorded
+        return bool(self.errors or self.recommendations or self._comments)
+
+    def comments(self):  # yields one formatted line per finding: errors, recommendations, comments
+        for e in self.errors:
+            yield 'error: ' + ' '.join(map(str, e))  # str() so None or int args cannot break join
+        for r in self.recommendations:
+            yield 'recommendation: ' + ' '.join(map(str, r))
+        for c in self._comments:
+            yield 'comment: ' + ' '.join(map(str, c))
+
+
+class WrapRecord(object):
+    def __init__(self, obj):
+        self.obj = obj
+        self._content = None
+
+    def __getattr__(self, name):
+        if name == 'content':
+            if self._content is None:
+                self._content = self.obj.content_stream().read()
+            return self._content
+        return getattr(self.__dict__['obj'], name)
+
+
+def canon_content_type(s):
+    return s.lower().replace('; ', ';')
+
+
+def validate_warc_fields(record, commentary):
+    # warc-fields = *named-field CRLF
+    # named-field = field-name ":" [ field-value ]
+    # field-value = *( field-content | LWS )  # LWS signals continuations
+    # field-name = token  # token_re
+
+    content = record.content
+    try:
+        text = content.decode('utf-8', errors='strict')
+    except UnicodeDecodeError as e:
+        commentary.error('warc-fields contains invalid utf-8: '+str(e))
+        text = content.decode('utf-8', errors='replace')  # keep going on a lossy decode
+
+    first_line = True
+    lines = []  # accepted logical lines, with continuations folded in
+    for line in text.splitlines(True):
+        if not line.endswith('\r\n'):
+            commentary.error('warc-fields lines must end with \\r\\n')  # escaped so the message shows \r\n literally
+            line = line.rstrip('\r\n')
+        else:
+            line = line[:-2]
+
+        if line.startswith(' ') or line.startswith('\t'):
+            if first_line:
+                commentary.error('The first line of warc-fields cannot start with whitespace')
+            elif lines:  # nothing to extend if every earlier line was rejected
+                lines[-1] += ' ' + line[1:]
+        elif line == '':
+            # are blank lines prohibited?
+            pass
+        else:
+            # check for field-name :
+            if ':' not in line:
+                commentary.error('Missing field-name : in warc-fields line', line)
+            else:
+                field_name = line.split(':', 1)[0]
+                if not re.fullmatch(token_re, field_name):
+                    commentary.error('invalid warc-fields name', field_name)  # was a call on the Commentary object itself
+                else:
+                    lines.append(line)
+        first_line = False
+
+    # check known fields
+
+
+def validate_warcinfo(record, commentary, pending):
+    content_type = record.rec_headers.get_header('Content-Type') or ''  # tolerate a missing header
+    if content_type.lower() != 'application/warc-fields':
+        commentary.recommendation('warcinfo Content-Type of application/warc-fields, saw', content_type)
+    else:
+        #   format: warc-fields
+        #   allowable fields include but not limited to DMCI plus the following
+        #   operator, software, robots, hostname, ip, http-header-user-agent, http-header-from
+        #     if operator present, recommended name or name and email address
+        #     comment if http-user-agent here and in the request or metadata record?
+        #     comment if http-header-from here and in the request?
+        validate_warc_fields(record, commentary)
+
+    # whole-file tests:
+    # optional that warcinfo be first in file, still deserves a comment
+    # allowable for warcinfo to appear anywhere
+
+
+def validate_response(record, commentary, pending):
+    target_uri = (record.rec_headers.get_header('WARC-Target-URI') or '').lower()  # header may be absent
+
+    if target_uri.startswith('http:') or target_uri.startswith('https:'):
+        content_type = record.rec_headers.get_header('Content-Type') or ''  # header may be absent
+        if canon_content_type(content_type) not in {'application/http;msgtype=response', 'application/http'}:
+            commentary.error('responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw ', content_type)
+
+        if record.rec_headers.get_header('WARC-IP-Address') is None:
+            commentary.error('WARC-IP-Address should be used for http and https responses')
+
+        # error: http and https schemes should have http response headers
+        # comment: verify http content-length, if present -- commoncrawl nutch bug
+
+
+def validate_resource(record, commentary, pending):
+    target_uri = (record.rec_headers.get_header('WARC-Target-URI') or '').lower()  # header may be absent
+
+    if target_uri.startswith('dns:'):
+        content_type = record.rec_headers.get_header('Content-Type') or ''  # header may be absent
+        if content_type.lower() != 'text/dns':
+            commentary.error('resource records for dns: shall have Content-Type of text/dns, saw', content_type)
+        else:
+            # rfc 2540 and rfc 1035
+            #validate_text_dns()
+            pass
+
+    # should never have http headers
+
+
+def validate_request(record, commentary, pending):
+    target_uri = (record.rec_headers.get_header('WARC-Target-URI') or '').lower()  # header may be absent
+
+    if target_uri.startswith('http:') or target_uri.startswith('https:'):
+        content_type = record.rec_headers.get_header('Content-Type') or ''  # header may be absent
+
+        if canon_content_type(content_type) not in {'application/http;msgtype=request', 'application/http'}:
+            commentary.error('requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw ', content_type)
+
+        if record.rec_headers.get_header('WARC-IP-Address') is None:
+            commentary.error('WARC-IP-Address should be used for http and https requests')
+
+        # error: http and https schemes should have http request headers
+
+        # WARC-Concurrent-To field or fields may be used, comment if present but target record is not
+
+
+def validate_metadata(record, commentary, pending):
+    content_type = record.rec_headers.get_header('Content-Type') or ''  # header may be absent
+    if content_type.lower() == 'application/warc-fields':
+        # dublin core plus via, hopsFromSeed, fetchTimeMs -- w1.1 section 6
+        # via: uri -- example in Warc 1.1 section 10.5 does not have <> around it
+        # hopsFromSeed: string
+        # fetchTimeMs: time in milliseconds, so it's an integer?
+        validate_warc_fields(record, commentary)
+
+
+def validate_revisit(record, commentary, pending):
+    warc_profile = record.rec_headers.get_header('WARC-Profile') or ''  # absent profile falls to the final else
+
+    if warc_profile.endswith(('/revisit/identical-payload-digest', '/revisit/uri-agnostic-identical-payload-digest')):
+        config = {
+            'required': ['WARC-Payload-Digest'],
+            'recommended': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date'],
+        }
+        validate_fields_against_rec_type('revisit', config, record.rec_headers, commentary, allow_all=True)
+        # may have record block; if not, shall have Content-Length: 0, if yes, should be like a response record, truncated FOR LENGTH ONLY if desired
+        # recommended that server response headers be preserved "in this manner"
+
+    elif warc_profile.endswith('/revisit/server-not-modified'):  # was ends_with, which str does not have
+        config = {
+            'recommended': ['WARC-Refers-To', 'WARC-Refers-To-Date'],
+            'prohibited': ['WARC-Payload-Digest'],
+        }
+        validate_fields_against_rec_type('revisit', config, record.rec_headers, commentary, allow_all=True)
+        #   may have content body; if not, shall have Content-Length: 0, if yes, should be like a response record, truncated if desired
+        #   WARC-Refers-To-Date should be the same as WARC-Date in the original record if present
+    else:
+        commentary.comment('no revisit details validation done due to unknown profile')
+
+
+def validate_conversion(record, commentary, pending):
+    # where practical, have a warc-refers-to field -- not quite a recommendation, perhaps make it a comment?
+    # suggests there should be a corresponding metadata record -- which may have a WARC-Refers-To
+    pass
+
+
+def validate_continuation(record, commentary, pending):
+    commentary.comment('warcio test continuation code has not been tested, expect bugs')
+
+    warc_type = record.rec_headers.get_header('WARC-Type')
+    if warc_type in {'warcinfo', 'request', 'metadata', 'revisit'}:
+        commentary.recommendation('do not segment warc-type', warc_type)
+
+    # last segment: required WARC-Segment-Total-Length, optional WARC-Truncated
+
+
+def validate_actual_uri(field, value, record, version, commentary, pending):
+    # uri per RFC 3986
+    # should use a registered scheme
+    # %XX encoding, normalize to upper case
+    # schemes are case-insensitive and normalize to lower
+    if value.startswith('<') or value.endswith('>'):
+        # wget 1.19 bug caused by WARC 1.0 spec error
+        commentary.error('uri must not be within <>', field, value)
+    scheme, colon, rest = value.partition(':')  # partition never raises, unlike unpacking split()
+    if not colon:
+        commentary.error('invalid uri, no scheme', field, value)
+    if re.search(r'\s', value, re.A):
+        commentary.error('invalid uri, contains whitespace', field, value)
+    if colon and not re.fullmatch(r'[A-Za-z][A-Za-z0-9+\-\.]*', scheme, re.A):  # only judge a scheme that exists
+        commentary.error('invalid uri scheme, bad character', field, value)
+    # https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
+
+
+def validate_warc_type(field, value, record, version, commentary, pending):
+    if not value.islower():
+        # I am unclear if this is allowed? standard is silent
+        commentary.comment('Warc-Type is not lower-case', field, value)
+    if value.lower() not in record_types:
+        # standard says readers should ignore unknown warc-types
+        commentary.comment('unknown Warc-Type', field, value)
+
+
+def validate_uri(field, value, record, version, commentary, pending):
+    # < uri >
+    if not (value.startswith('<') and value.endswith('>')):
+        commentary.error('uri must be within <>', field, value)
+        return
+    validate_actual_uri(field, value[1:-1], record, version, commentary, pending)
+
+
+def validate_record_id(field, value, record, version, commentary, pending):
+    validate_uri(field, value, record, version, commentary, pending)
+    # TODO: should be "globally unique for its period of intended use"
+
+
+def validate_timestamp(field, value, record, version, commentary, pending):
+    use_ms = version != '1.0'  # any version other than 1.0 is treated as allowing fractions
+    if not use_ms:
+        if '.' in value:
+            # XXX specification infelicity: would be nice to have 'advice to implementers' here
+            commentary.error('WARC 1.0 may not have fractional seconds', field, value)
+    elif '.' in value:  # a timestamp with no fraction has nothing to check and must not raise
+        end = value.split('.', 1)[1]
+        if not re.fullmatch(r'[0-9]{1,9}Z', end, re.A):
+            commentary.error('fractional seconds must have 1-9 digits', field, value)
+
+    # XXX the above is pretty incomplete for dash, colon, trailing Z, etc
+
+    # TODO: "multiple records written as part of a single capture event shall use the same WARC-Date"
+    # how? follow WARC-Concurrent-To pointer(s) from request to response(s)
+
+
+def validate_content_length(field, value, record, version, commentary, pending):
+    if not value.isdigit():
+        commentary.error('must be an integer', field, value)
+
+
+token_re = r'[!"#$%&\'()*+\-\.0-9A-Z\^_`a-z|~]+'
+digest_re = r'[A-Za-z0-9/+\-_=]+'
+
+
+def validate_content_type(field, value, record, version, commentary, pending):
+    if '/' not in value:
+        commentary.error('must contain a /', field, value)
+        return  # without a '/' there is no type/subtype to examine
+    ctype, rest = value.split('/', 1)
+    if not re.fullmatch(token_re, ctype, re.A):
+        commentary.error('invalid type', field, value)
+    # only the media type itself is validated here: anything after the
+    # first ';' is a parameter list and is dropped before the check
+    subtype = rest.split(';', 1)[0]
+    if not re.fullmatch(token_re, subtype, re.A):
+        commentary.error('invalid subtype', field, value)
+    # at this point there can be multiple parameters,
+    # some of which could have quoted string values with ; in them
+    # TODO: more checking
+
+
+def validate_digest(field, value, record, version, commentary, pending):
+    if ':' not in value:
+        commentary.error('missing algorithm', field, value)
+        return  # without a ':' there is no algorithm/digest pair to examine
+    algorithm, digest = value.split(':', 1)
+    if not re.fullmatch(token_re, algorithm, re.A):
+        commentary.error('invalid algorithm', field, value)
+    # The digest is deliberately not checked against token_re: the spec
+    # incorrectly makes such digests invalid, see
+    # https://github.com/iipc/warc-specifications/issues/48
+    if not re.fullmatch(digest_re, digest, re.A):
+        commentary.comment('Invalid-looking digest value', field, value)
+
+
+def validate_ip(field, value, record, version, commentary, pending):
+    # ipv4 as dotted quad, or ipv6 per section 2.2 of rfc 4291
+    try:
+        ipaddress.ip_address(value)
+    except ValueError:
+        commentary.error('invalid ip', field, value)
+
+
+def validate_truncated(field, value, record, version, commentary, pending):
+    if value.lower() not in {'length', 'time', 'disconnect', 'unspecified'}:
+        commentary.comment('extension seen', field, value)
+
+
+def validate_warcinfo_id(field, value, record, version, commentary, pending):
+    validate_uri(field, value, record, version, commentary, pending)
+    # TODO: should point at a warcinfo record
+
+
+def validate_filename(field, value, record, version, commentary, pending):
+    # TODO: text or quoted-string
+    pass
+
+
+profiles = {  # revisit profile URIs recognised for each WARC version
+    '1.0': ['http://netpreserve.org/warc/1.0/revisit/identical-payload-digest',
+            'http://netpreserve.org/warc/1.0/revisit/server-not-modified',
+            # the following removed from iipc/webarchive-commons in may 2017; common in the wild TODO comment or not?
+            'http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest'],
+    '1.1': ['http://netpreserve.org/warc/1.1/revisit/identical-payload-digest',
+            'http://netpreserve.org/warc/1.1/revisit/server-not-modified'],
+}
+
+
+def validate_profile(field, value, record, version, commentary, pending):
+    if version not in profiles:
+        commentary.comment('no profile check because unknown warc version', field, value)
+        return
+    if value not in profiles[version]:
+        commentary.comment('extension seen', field, value)
+
+
+def validate_segment_number(field, value, record, version, commentary, pending):
+    if not value.isdigit():
+        commentary.error('must be an integer', field, value)
+    elif int(value) == 0:
+        # only converted once the value is known to be numeric
+        commentary.error('must be 1 or greater', field, value)
+    # TODO: type != continuation must have segment number 1, else > 1
+    # might make that check in the 'continuation' section?
+
+
+def validate_segment_total_length(field, value, record, version, commentary, pending):
+    if not value.isdigit():
+        commentary.error('must be an integer', field, value)
+
+
+warc_fields = {
+    'WARC-Type': {
+        'validate': validate_warc_type,
+    },
+    'WARC-Record-ID': {
+        'validate': validate_record_id,
+    },
+    'WARC-Date': {
+        'validate': validate_timestamp,
+    },
+    'Content-Length': {
+        'validate': validate_content_length,
+    },
+    'Content-Type': {
+        'validate': validate_content_type,
+    },
+    'WARC-Concurrent-To': {
+        'validate': validate_uri,
+    },
+    'WARC-Block-Digest': {
+        'validate': validate_digest,  # openssl check? or just let check_digest get it?
+    },
+    'WARC-Payload-Digest': {
+        'validate': validate_digest,
+    },
+    'WARC-IP-Address': {
+        'validate': validate_ip,
+    },
+    'WARC-Refers-To': {
+        'validate': validate_uri,
+    },
+    'WARC-Target-URI': {
+        'validate': validate_actual_uri,
+    },
+    'WARC-Truncated': {
+        'validate': validate_truncated,
+    },
+    'WARC-Warcinfo-ID': {
+        'validate': validate_warcinfo_id,
+    },
+    'WARC-Filename': {
+        'validate': validate_filename,
+    },
+    'WARC-Profile': {
+        'validate': validate_profile,
+    },
+    'WARC-Identified-Payload-Type': {
+        'validate': validate_content_type,
+    },
+    'WARC-Segment-Origin-ID': {
+        'validate': validate_uri,
+    },
+    'WARC-Segment-Number': {
+        'validate': validate_segment_number,
+    },
+    'WARC-Segment-Total-Length': {
+        'validate': validate_segment_total_length,
+    },
+    'WARC-Refers-To-Target-URI': {
+        'validate': validate_actual_uri,
+        'minver': '1.1',
+    },
+    'WARC-Refers-To-Date': {
+        'validate': validate_timestamp,
+        'minver': '1.1',
+    },
+}
+warc_fields = dict([(k.lower(), v) for k, v in warc_fields.items()])
+
+record_types = {
+    'warcinfo': {
+        'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'Content-Type'],
+        'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Filename', 'WARC-Truncated'],
+        'prohibited': ['WARC-Refers-To', 'WARC-Profile', 'WARC-Identified-Payload-Type'],
+        'validate': validate_warcinfo,
+    },
+    'response': {
+        'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type',
+                     'Content-Type', 'WARC-Target-URI'],
+        'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated',
+                     'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address'],
+        'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
+        'validate': validate_response,
+    },
+    'resource': {
+        'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'WARC-Target-URI', 'Content-Type'],
+        'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated',
+                     'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-Identified-Payload-Type'],
+        'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
+    },
+    'request': {
+        'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type',
+                     'Content-Type', 'WARC-Target-URI'],
+        'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated',
+                     'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address'],
+        'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
+        'validate': validate_request,
+    },
+    'metadata': {
+        'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type',
+                     'Content-Type'],
+        'optional': ['WARC-Block-Digest', 'WARC-IP-Address', 'WARC-Truncated',
+                     'WARC-Concurrent-To', 'WARC-Refers-To', 'WARC-Target-URI', 'WARC-Warcinfo-ID'],
+        'prohibited': ['WARC-Payload-Digest', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
+        'validate': validate_metadata,
+    },
+    'revisit': {
+        'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type',
+                     'Content-Type', 'WARC-Target-URI', 'WARC-Profile'],
+        'optional': ['WARC-Block-Digest', 'WARC-Truncated', 'WARC-IP-Address', 'WARC-Warcinfo-ID',  # normal optionals
+                     'WARC-Payload-Digest', 'WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date'],  # these are for profiles
+        'prohibited': ['WARC-Filename'],
+        'validate': validate_revisit,
+    },
+    'conversion': {
+        'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'WARC-Target-URI'],
+        'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Truncated', 'WARC-Refers-To', 'WARC-Warcinfo-ID'],
+        'prohibited': ['WARC-Concurrent-To', 'WARC-IP-Address', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
+        'validate': validate_conversion,
+    },
+    'continuation': {
+        'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type',
+                     'WARC-Segment-Origin-ID', 'WARC-Segment-Number', 'WARC-Target-URI'],
+        'optional': [],
+        'prohibited': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
+        'validate': validate_continuation,
+    },
+}
+
+
+def make_header_set(config, kinds):
+    ret = set()
+    for kind in kinds:
+        ret = ret.union(set([x.lower() for x in config.get(kind, [])]))
+    return ret
+
+
+def validate_fields_against_rec_type(rec_type, config, rec_headers, commentary, allow_all=False):
+    for req in config.get('required', []):
+        if not rec_headers.get_header(req):
+            commentary.error('missing required header', req)
+    for rec in config.get('recommended', []):
+        if not rec_headers.get_header(rec):
+            commentary.recommendation('missing recommended header', rec)
+    allowed = make_header_set(config, ('required', 'optional', 'recommended'))
+    prohibited = make_header_set(config, ('prohibited',))
+
+    for field, value in rec_headers.headers:
+        fl = field.lower()
+        if fl in prohibited:
+            commentary.error('field not allowed in record_type', field, rec_type)
+        elif allow_all or fl in allowed:
+            pass
+        elif fl in warc_fields:
+            commentary.comment('no configuration seen for', field, rec_type)
+        else:
+            # an 'unknown field' comment has already been issued in validate_record
+            pass
+
+
+def validate_record_against_rec_type(config, record, commentary, pending):
+    if 'validate' in config:
+        config['validate'](record, commentary, pending)
+
+
+def validate_record(record):
+    version = record.rec_headers.protocol.split('/', 1)[1]  # XXX not exported?
+
+    record_id = record.rec_headers.get_header('WARC-Record-ID')
+    rec_type = record.rec_headers.get_header('WARC-Type')
+    if record_id is None:
+        print('no WARC-Record-ID seen, skipping validation', file=sys.stderr)
+        return  # NOTE: returns None here; callers must handle the missing Commentary
+    commentary = Commentary(record_id, rec_type)
+    pending = None
+
+    seen_fields = set()
+    for field, value in record.rec_headers.headers:
+        field_case = field
+        field = field.lower()
+        if field != 'warc-concurrent-to' and field in seen_fields:
+            commentary.error('duplicate field seen', field, value)
+        seen_fields.add(field)  # previously never recorded, so the duplicate check could not fire
+        if field not in warc_fields:
+            commentary.comment('unknown field, no validation performed', field_case, value)
+            continue
+        config = warc_fields[field]
+        if 'minver' in config and version < config['minver']:
+            # unknown fields are extensions, so this is a comment and not an error
+            commentary.comment('field was introduced after this warc version', field_case, value, version)
+        if 'validate' in config:
+            config['validate'](field, value, record, version, commentary, pending)
+
+    # TODO: validate warc types: unknown should get a comment
+    if rec_type not in record_types:
+        commentary.comment('unknown record type, no validation performed', str(rec_type))  # rec_type may be None
+    else:
+        validate_fields_against_rec_type(rec_type, record_types[rec_type], record.rec_headers, commentary)
+        validate_record_against_rec_type(record_types[rec_type], record, commentary, pending)
+
+    return commentary
+
+
+def _process_one(warc):
+    if warc.endswith('.arc') or warc.endswith('.arc.gz'):
+        return  # ARC files are skipped without comment
+    with open(warc, 'rb') as stream:
+        for record in WARCIterator(stream, check_digests=True, fixup_bugs=False):
+            try:
+                record = WrapRecord(record)
+                digest_present = (record.rec_headers.get_header('WARC-Payload-Digest') or
+                                  record.rec_headers.get_header('WARC-Block-Digest'))
+
+                commentary = validate_record(record)
+
+                record.content  # make sure digests are checked
+                # XXX might need to read and digest the raw stream to check digests for chunked encoding?
+                # XXX chunked lacks Content-Length and presumably the digest needs to be computed on the non-chunked bytes
+            except Exception:
+                # because of the top-level try: to catch exceptions in WARCIterator, this is needed to debug our code
+                print('Caught exception in warcio test analysis code')
+                traceback.print_exc()
+                sys.exit(1)  # sys.exit rather than the interactive-only exit() helper
+
+            if commentary is None:
+                continue  # validate_record already reported the missing WARC-Record-ID
+
+            if commentary.has_comments() or record.digest_checker.passed is False:
+                print(' ', 'WARC-Record-ID', commentary.record_id())
+                print('   ', 'WARC-Type', commentary.rec_type())
+
+                if record.digest_checker.passed is True:
+                    print('    digest pass')
+                elif record.digest_checker.passed is None:
+                    if digest_present:
+                        print('    digest present but not checked')
+                    else:
+                        print('    digest not present')
+                for p in record.digest_checker.problems:
+                    print('   ', p)
+
+                for c in commentary.comments():  # yields nothing when there are no findings
+                    print('   ', c)
+
+
+class Tester(object):  # driver for the `warcio test` sub-command
+    def __init__(self, cmd):
+        self.inputs = cmd.inputs
+        self.verbose = getattr(cmd, 'verbose', False)  # the `test` sub-parser defines no --verbose flag
+        self.exit_value = 0
+
+    def process_all(self):
+        for warc in self.inputs:
+            print(warc)
+            try:
+                self.process_one(warc)
+            except Exception as e:
+                print('  saw exception '+str(e).rstrip(), file=sys.stderr)
+                print('  skipping rest of file', file=sys.stderr)
+        return self.exit_value  # NOTE(review): never set non-zero in this class; confirm that is intended
+
+    def process_one(self, filename):
+        _process_one(filename)

From ebb721f768e7b0e4c98c93820fcb6743c3c96025 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Fri, 25 Jan 2019 16:12:21 -0800
Subject: [PATCH 02/68] documentation

---
 README.rst | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/README.rst b/README.rst
index d4990f06..513f97d8 100644
--- a/README.rst
+++ b/README.rst
@@ -368,6 +368,14 @@ of WARC records, if possible. An exit value of 1 indicates a failure.
 ``warcio check -v`` will print verbose output for each record in the
 WARC file.
 
+Test
+~~~~
+
+The ``warcio test`` command will check one or more WARC files against
+the WARC standard, giving commentary about standards violations,
+recommendations, and other issues.
+
+
 Recompress
 ~~~~~~~~~~
 

From 7aa060db1aa7c8d53c8c33fd3f716f0e1686ddf1 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Fri, 25 Jan 2019 16:42:58 -0800
Subject: [PATCH 03/68] tests

---
 test/test_archiveiterator.py | 2 +-
 test/test_cli.py             | 2 +-
 warcio/tester.py             | 5 +++--
 warcio/utils.py              | 6 +++---
 4 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/test/test_archiveiterator.py b/test/test_archiveiterator.py
index 810ba73b..2015c63b 100644
--- a/test/test_archiveiterator.py
+++ b/test/test_archiveiterator.py
@@ -186,7 +186,7 @@ def test_corrects_wget_bug(self):
         with self._find_first_by_type('example-wget-bad-target-uri.warc.gz', 'response') as record:
             assert record.rec_headers.get('WARC-Target-URI') == 'http://example.com/'
         with self._find_first_by_type('example-wget-bad-target-uri.warc.gz', 'response', fixup_bugs=False) as record:
-            assert record.rec_headers.get('WARC-Target-URI') == 'http://example.com/'
+            assert record.rec_headers.get('WARC-Target-URI') == '<http://example.com/>'
 
     def _digests_mutilate_helper(self, contents, expected_t, expected_f, capsys, full_read=False):
         with pytest.raises(ArchiveLoadFailed):
diff --git a/test/test_cli.py b/test/test_cli.py
index dc643ec4..4aaa96fd 100644
--- a/test/test_cli.py
+++ b/test/test_cli.py
@@ -90,7 +90,7 @@ def test_check_valid():
 
 
 def test_check_invalid():
-    filenames = [get_test_file('example-digest.warc')]
+    filenames = [get_test_file('example-digest-bad.warc')]
 
     args = ['check'] + filenames
     value = check_helper(args, 1)
diff --git a/warcio/tester.py b/warcio/tester.py
index 800f797e..de456dc8 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -6,6 +6,7 @@
 import traceback
 
 from warcio.archiveiterator import WARCIterator
+from warcio.utils import to_native_str
 
 
 class Commentary:
@@ -69,10 +70,10 @@ def validate_warc_fields(record, commentary):
 
     content = record.content
     try:
-        text = content.decode('utf-8', errors='strict')
+        text = to_native_str(content, 'utf-8', errors='strict')
     except UnicodeDecodeError as e:
         commentary.error('warc-fields contains invalid utf-8: '+str(e))
-        text = content.decode('utf-8', errors='replace')
+        text = to_native_str(content, 'utf-8', errors='replace')
 
     first_line = True
     lines = []
diff --git a/warcio/utils.py b/warcio/utils.py
index 23050548..6fd8a92f 100644
--- a/warcio/utils.py
+++ b/warcio/utils.py
@@ -13,14 +13,14 @@
 
 
 # #===========================================================================
-def to_native_str(value, encoding='utf-8'):
+def to_native_str(value, encoding='utf-8', errors='strict'):
     if isinstance(value, str):
         return value
 
     if six.PY3 and isinstance(value, six.binary_type):  #pragma: no cover
-        return value.decode(encoding)
+        return value.decode(encoding, errors)
     elif six.PY2 and isinstance(value, six.text_type):  #pragma: no cover
-        return value.encode(encoding)
+        return value.encode(encoding, errors)
     else:
         return value
 

From 24f300055ceaa32120e51c0a29d73a6f573b1e7d Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Fri, 25 Jan 2019 17:03:04 -0800
Subject: [PATCH 04/68] tests

---
 warcio/tester.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/warcio/tester.py b/warcio/tester.py
index de456dc8..386586bb 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -1,7 +1,6 @@
 from __future__ import print_function
 
 import re
-import ipaddress
 import sys
 import traceback
 
@@ -9,6 +8,14 @@
 from warcio.utils import to_native_str
 
 
+def try_ipaddress_init():
+    # ipaddress is in 3.3+ but not 2.7. It is in pypi but we wish to limit dependencies.
+    try:
+        import ipaddress
+    except ImportError:  # pragma: no cover
+        pass
+
+
 class Commentary:
     def __init__(self, record_id, rec_type):
         self._record_id = record_id
@@ -325,6 +332,8 @@ def validate_ip(field, value, record, version, commentary, pending):
         ipaddress.ip_address(value)
     except ValueError:
         commentary.error('invalid ip', field, value)
+    except NameError:
+        commentary.comment('did not check ip address format, install ipaddress module from pypi if you care')
 
 
 def validate_truncated(field, value, record, version, commentary, pending):
@@ -622,8 +631,8 @@ def _process_one(warc):
 class Tester(object):
     def __init__(self, cmd):
         self.inputs = cmd.inputs
-        self.verbose = cmd.verbose
         self.exit_value = 0
+        try_ipaddress_init()
 
     def process_all(self):
         for warc in self.inputs:

From 40f9fc66292bad2773a33a4c347f5e24280f9ad4 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Sat, 26 Jan 2019 08:39:22 -0800
Subject: [PATCH 05/68] coverage

---
 test/data/standard-torture-missing.warc       |   5 +
 .../standard-torture-validate-record.warc     |  79 ++++++++++
 test/test_tests.py                            | 149 ++++++++++++++++++
 warcio/tester.py                              |  79 ++++++----
 4 files changed, 278 insertions(+), 34 deletions(-)
 create mode 100644 test/data/standard-torture-missing.warc
 create mode 100644 test/data/standard-torture-validate-record.warc
 create mode 100644 test/test_tests.py

diff --git a/test/data/standard-torture-missing.warc b/test/data/standard-torture-missing.warc
new file mode 100644
index 00000000..a1ab0714
--- /dev/null
+++ b/test/data/standard-torture-missing.warc
@@ -0,0 +1,5 @@
+WARC/1.0
+WARC-Type: warcinfo
+Content-Length: 0
+
+
diff --git a/test/data/standard-torture-validate-record.warc b/test/data/standard-torture-validate-record.warc
new file mode 100644
index 00000000..5181ea38
--- /dev/null
+++ b/test/data/standard-torture-validate-record.warc
@@ -0,0 +1,79 @@
+WARC/1.0
+WARC-Type: warcinfo
+Content-Type: application/warc-fields
+Content-Length: 146
+
+ first line can't start with a space
+test: invalid utf8 �(
+test: lines should end with \r\n
+foo:
+ bar
+
+no colon
+token cannot have a space:
+
+
+WARC/1.0
+WARC-Type: warcinfo
+Content-Type: application/warc-fields
+Content-Length: 0
+
+
+WARC/1.0
+WARC-Type: response
+WARC-Target-URI: HtTp://example.com/
+Content-Type: text/plain
+Content-Length: 0
+
+
+WARC/1.0
+WARC-Type: resource
+WARC-Target-URI: DnS:asdfasdf
+Content-Type: text/plain
+Content-Length: 0
+
+
+WARC/1.0
+WARC-Type: resource
+WARC-Target-URI: DnS:asdfasdf
+Content-Type: text/dns
+Content-Length: 0
+
+
+WARC/1.0
+WARC-Type: request
+WARC-Target-URI: hTtP://example.com/
+Content-Type: text/plain
+Content-Length: 0
+
+
+WARC/1.0
+WARC-Type: metadata
+Content-Type: application/warc-fields
+Content-Length: 0
+
+
+WARC/1.0
+WARC-Type: revisit
+WARC-Profile: none
+Content-Length: 0
+
+
+WARC/1.0
+WARC-Type: revisit
+WARC-Profile: http://netpreserve.org/warc/1.1/revisit/identical-payload-digest
+Content-Length: 0
+
+
+WARC/1.0
+WARC-Type: revisit
+WARC-Profile: http://netpreserve.org/warc/1.0/revisit/server-not-modified
+Content-Length: 0
+
+
+WARC/1.0
+WARC-Type: continuation
+WARC-Segment-Number: 1
+Content-Length: 0
+
+
diff --git a/test/test_tests.py b/test/test_tests.py
new file mode 100644
index 00000000..239d2461
--- /dev/null
+++ b/test/test_tests.py
@@ -0,0 +1,149 @@
+from warcio.cli import main
+
+from . import get_test_file
+from .test_cli import patch_stdout
+
+
+def helper(args, expected_exit_value):
+    with patch_stdout() as buff:
+        exit_value = None
+        try:
+            main(args=args)
+        except SystemExit as e:
+            exit_value = e.code
+        finally:
+            assert exit_value == expected_exit_value
+
+        return buff.getvalue()
+
+
+def remove_before_test_data(s):
+    ret = b''
+    for line in s.splitlines(True):
+        if b'/test/data/' in line:
+            line = b'test/data/' + line.split(b'/test/data/', 1)[1]
+        ret += line
+    return ret
+
+
+def test_torture_missing():
+    files = ['standard-torture-missing.warc']
+    files = [get_test_file(filename) for filename in files]
+
+    args = ['test']
+    args.extend(files)
+
+    expected = b"""\
+test/data/standard-torture-missing.warc
+  WARC-Record-ID None
+    WARC-Type warcinfo
+    digest not present
+    error: missing required header Content-Type
+    error: missing required header WARC-Date
+    error: missing required header WARC-Record-ID
+    recommendation: warcinfo Content-Type of application/warc-fields, saw none
+"""
+
+    value = helper(args, 0)
+    assert remove_before_test_data(value) == expected
+
+
+def test_torture_validate_record():
+    files = ['standard-torture-validate-record.warc']
+    files = [get_test_file(filename) for filename in files]
+
+    args = ['test']
+    args.extend(files)
+
+    expected = b"""\
+test/data/standard-torture-validate-record.warc
+  WARC-Record-ID None
+    WARC-Type warcinfo
+    digest not present
+    error: missing required header WARC-Date
+    error: missing required header WARC-Record-ID
+    error: warc-fields contains invalid utf-8: 'utf-8' codec can't decode byte 0xc3 in position 57: invalid continuation byte
+    comment: The first line of warc-fields cannot start with whitespace
+    comment: warc-fields lines must end with \\r\\n: test: lines should end with \\r\\n
+    comment: Missing field-name : in warc-fields line: no colon
+    comment: invalid warc-fields name: token cannot have a space
+  WARC-Record-ID None
+    WARC-Type warcinfo
+    digest not present
+    error: missing required header WARC-Date
+    error: missing required header WARC-Record-ID
+    comment: warc-fields body present but empty
+  WARC-Record-ID None
+    WARC-Type response
+    digest not present
+    error: missing required header WARC-Date
+    error: missing required header WARC-Record-ID
+    error: responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw text/plain
+    error: WARC-IP-Address should be used for http and https responses
+  WARC-Record-ID None
+    WARC-Type resource
+    digest not present
+    error: missing required header WARC-Date
+    error: missing required header WARC-Record-ID
+  WARC-Record-ID None
+    WARC-Type resource
+    digest not present
+    error: missing required header WARC-Date
+    error: missing required header WARC-Record-ID
+  WARC-Record-ID None
+    WARC-Type request
+    digest not present
+    error: missing required header WARC-Date
+    error: missing required header WARC-Record-ID
+    error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw text/plain
+    error: WARC-IP-Address should be used for http and https requests
+  WARC-Record-ID None
+    WARC-Type metadata
+    digest not present
+    error: missing required header WARC-Date
+    error: missing required header WARC-Record-ID
+    comment: warc-fields body present but empty
+  WARC-Record-ID None
+    WARC-Type revisit
+    digest not present
+    error: missing required header Content-Type
+    error: missing required header WARC-Date
+    error: missing required header WARC-Record-ID
+    error: missing required header WARC-Target-URI
+    comment: extension seen warc-profile none
+    comment: no revisit details validation done due to unknown profile
+  WARC-Record-ID None
+    WARC-Type revisit
+    digest not present
+    error: missing required header Content-Type
+    error: missing required header WARC-Date
+    error: missing required header WARC-Record-ID
+    error: missing required header WARC-Target-URI
+    error: missing required header WARC-Payload-Digest
+    recommendation: missing recommended header WARC-Refers-To
+    recommendation: missing recommended header WARC-Refers-To-Date
+    recommendation: missing recommended header WARC-Refers-To-Target-URI
+  WARC-Record-ID None
+    WARC-Type revisit
+    digest not present
+    error: missing required header Content-Type
+    error: missing required header WARC-Date
+    error: missing required header WARC-Record-ID
+    error: missing required header WARC-Target-URI
+    recommendation: missing recommended header WARC-Refers-To
+    recommendation: missing recommended header WARC-Refers-To-Date
+    comment: extension seen warc-profile http://netpreserve.org/warc/1.0/revisit/server-not-modified
+  WARC-Record-ID None
+    WARC-Type continuation
+    digest not present
+    error: missing required header WARC-Date
+    error: missing required header WARC-Record-ID
+    error: missing required header WARC-Segment-Origin-ID
+    error: missing required header WARC-Target-URI
+    error: continuation record must have WARC-Segment-Number > 1, saw 1
+    comment: warcio test continuation code has not been tested, expect bugs
+"""
+
+    value = helper(args, 0)
+    print(remove_before_test_data(value).decode())
+    assert remove_before_test_data(value) == expected
diff --git a/warcio/tester.py b/warcio/tester.py
index 386586bb..bdfe38f0 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -75,10 +75,10 @@ def validate_warc_fields(record, commentary):
     # field-value = *( field-content | LWS )  # LWS signals continuations
     # field-name = token  # token_re
 
-    content = record.content
+    content = record.content  # TESTME
     try:
         text = to_native_str(content, 'utf-8', errors='strict')
-    except UnicodeDecodeError as e:
+    except UnicodeDecodeError as e:  # TESTME
         commentary.error('warc-fields contains invalid utf-8: '+str(e))
         text = to_native_str(content, 'utf-8', errors='replace')
 
@@ -86,14 +86,14 @@ def validate_warc_fields(record, commentary):
     lines = []
     for line in text.splitlines(True):
         if not line.endswith('\r\n'):
-            commentary.error('warc-fields lines must end with \r\n')
+            commentary.comment('warc-fields lines must end with \\r\\n:', line.rstrip())
             line = line.rstrip('\r\n')
         else:
             line = line[:-2]
 
         if line.startswith(' ') or line.startswith('\t'):
             if first_line:
-                commentary.error('The first line of warc-fields cannot start with whitespace')
+                commentary.comment('The first line of warc-fields cannot start with whitespace')
             else:
                 lines[-1] += ' ' + line[1:]
         elif line == '':
@@ -102,22 +102,26 @@ def validate_warc_fields(record, commentary):
         else:
             # check for field-name :
             if ':' not in line:
-                commentary.error('Missing field-name : in warc-fields line', line)
+                commentary.comment('Missing field-name : in warc-fields line:', line)
             else:
                 field_name = line.split(':', 1)[0]
                 if not re.fullmatch(token_re, field_name):
-                    commentary('invalid warc-fields name', field_name)
+                    commentary.comment('invalid warc-fields name:', field_name)
                 else:
                     lines.append(line)
         first_line = False
 
+    if not lines:
+        commentary.comment('warc-fields body present but empty')
+        return
+
     # check known fields
 
 
 def validate_warcinfo(record, commentary, pending):
-    content_type = record.rec_headers.get_header('Content-Type')
+    content_type = record.rec_headers.get_header('Content-Type', 'none')
     if content_type.lower() != 'application/warc-fields':
-        commentary.recommencation('warcinfo Content-Type of application/warc-fields, saw', content_type)
+        commentary.recommendation('warcinfo Content-Type of application/warc-fields, saw', content_type)
     else:
         #   format: warc-fields
         #   allowable fields include but not limited to DMCI plus the following
@@ -133,25 +137,27 @@ def validate_warcinfo(record, commentary, pending):
 
 
 def validate_response(record, commentary, pending):
-    target_uri = record.rec_headers.get_header('WARC-Target-URI').lower()
+    target_uri = record.rec_headers.get_header('WARC-Target-URI', 'none').lower()  # TESTME
 
     if target_uri.startswith('http:') or target_uri.startswith('https:'):
-        content_type = record.rec_headers.get_header('Content-Type')
+        content_type = record.rec_headers.get_header('Content-Type', 'none')
         if canon_content_type(content_type) not in {'application/http;msgtype=response', 'application/http'}:
-            commentary.error('responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw ', content_type)
+            commentary.error('responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw', content_type)
 
         if record.rec_headers.get_header('WARC-IP-Address') is None:
             commentary.error('WARC-IP-Address should be used for http and https responses')
 
         # error: http and https schemes should have http response headers
+        #   test by attempting to parse them?
+
         # comment: verify http content-length, if present -- commoncrawl nutch bug
 
 
 def validate_resource(record, commentary, pending):
-    target_uri = record.rec_headers.get_header('WARC-Target-URI').lower()
+    target_uri = record.rec_headers.get_header('WARC-Target-URI', '').lower()  # TESTME
 
     if target_uri.startswith('dns:'):
-        content_type = record.rec_headers.get_header('Content-Type')
+        content_type = record.rec_headers.get_header('Content-Type', 'none')
         if content_type.lower() != 'text/dns':
             commentary.error('recource records for dns: shall have Content-Type of text/dns, saw', content_type)
         else:
@@ -163,13 +169,13 @@ def validate_resource(record, commentary, pending):
 
 
 def validate_request(record, commentary, pending):
-    target_uri = record.rec_headers.get_header('WARC-Target-URI').lower()
+    target_uri = record.rec_headers.get_header('WARC-Target-URI', 'none').lower()  # TESTME
 
     if target_uri.startswith('http:') or target_uri.startswith('https:'):
         content_type = record.rec_headers.get_header('Content-Type')
 
         if canon_content_type(content_type) not in {'application/http;msgtype=request', 'application/http'}:
-            commentary.error('requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw ', content_type)
+            commentary.error('requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw', content_type)
 
         if record.rec_headers.get_header('WARC-IP-Address') is None:
             commentary.error('WARC-IP-Address should be used for http and https requests')
@@ -180,7 +186,7 @@ def validate_request(record, commentary, pending):
 
 
 def validate_metadata(record, commentary, pending):
-    content_type = record.rec_headers.get_header('Content-Type')
+    content_type = record.rec_headers.get_header('Content-Type', 'none')  # TESTME
     if content_type.lower() == 'application/warc-fields':
         # dublin core plus via, hopsFromSeed, fetchTimeMs -- w1.1 section 6
         # via: uri -- example in Warc 1.1 section 10.5 does not have <> around it
@@ -190,7 +196,7 @@ def validate_metadata(record, commentary, pending):
 
 
 def validate_revisit(record, commentary, pending):
-    warc_profile = record.rec_headers.get_header('WARC-Profile')
+    warc_profile = record.rec_headers.get_header('WARC-Profile', 'none')  # TESTME
 
     if warc_profile.endswith('/revisit/identical-payload-digest') or warc_profile.endswith('/revisit/uri-agnostic-identical-payload-digest'):
         config = {
@@ -201,7 +207,7 @@ def validate_revisit(record, commentary, pending):
         # may have record block; if not, shall have Content-Length: 0, if yes, should be like a response record, truncated FOR LENGTH ONLY if desired
         # recommended that server response headers be preserved "in this manner"
 
-    elif warc_profile.ends_with('/revisit/server-not-modified'):
+    elif warc_profile.endswith('/revisit/server-not-modified'):
         config = {
             'recommended': ['WARC-Refers-To', 'WARC-Refers-To-Date'],
             'prohibited': ['WARC-Payload-Digest'],
@@ -216,15 +222,15 @@ def validate_revisit(record, commentary, pending):
 def validate_conversion(record, commentary, pending):
     # where practical, have a warc-refers-to field -- not quite a recommendation, perhaps make it a comment?
     # suggests there should be a corresponding metadata record -- which may have a WARC-Refers-To
-    pass
+    pass  # TESTME
 
 
 def validate_continuation(record, commentary, pending):
-    commentary.comment('warcio test continuation code has not been tested, expect bugs')
+    commentary.comment('warcio test continuation code has not been tested, expect bugs')  # TESTME
 
-    warc_type = record.rec_headers.get_header('WARC-Type')
-    if warc_type in {'warcinfo', 'request', 'metadata', 'revisit'}:
-        commentary.recommendation('do not segment warc-type', warc_type)
+    segment_number = record.rec_headers.get_header('WARC-Segment-Number', 'none')
+    if segment_number.isdigit() and int(segment_number) < 2:
+        commentary.error('continuation record must have WARC-Segment-Number > 1, saw', segment_number)
 
     # last segment: required WARC-Segment-Total-Length, optional WARC-Truncated
 
@@ -234,7 +240,7 @@ def validate_actual_uri(field, value, record, version, commentary, pending):
     # should use a registered scheme
     # %XX encoding, normalize to upper case
     # schemes are case-insensitive and normalize to lower
-    if value.startswith('<') or value.endswith('>'):
+    if value.startswith('<') or value.endswith('>'):  # TESTME
         # wget 1.19 bug caused by WARC 1.0 spec error
         commentary.error('uri must not be within <>', field, value)
     if ':' not in value:
@@ -250,10 +256,10 @@ def validate_actual_uri(field, value, record, version, commentary, pending):
 def validate_warc_type(field, value, record, version, commentary, pending):
     if not value.islower():
         # I am unclear if this is allowed? standard is silent
-        commentary.comment('Warc-Type is not lower-case', field, value)
+        commentary.comment('WARC-Type is not lower-case', field, value)
     if value.lower() not in record_types:
         # standard says readers should ignore unknown warc-types
-        commentary.comment('unknown Warc-Type', field, value)
+        commentary.comment('unknown WARC-Type', field, value)
 
 
 def validate_uri(field, value, record, version, commentary, pending):
@@ -307,8 +313,10 @@ def validate_content_type(field, value, record, version, commentary, pending):
         subtype = rest
     if not re.fullmatch(token_re, subtype, re.A):
         commentary.error('invalid subtype', field, value)
+
     # at this point there can be multiple parameters,
     # some of which could have quoted string values with ; in them
+
     # TODO: more checking
 
 
@@ -372,11 +380,17 @@ def validate_profile(field, value, record, version, commentary, pending):
 def validate_segment_number(field, value, record, version, commentary, pending):
     if not value.isdigit():
         commentary.error('must be an integer', field, value)
+        return
     iv = int(value)
     if iv == 0:
         commentary.error('must be 1 or greater', field, value)
-    # TODO: type != continuation must have iv == 1, else iv > 1
-    # might make that check in the 'continuation' section?
+
+    rec_type = record.rec_headers.get_header('WARC-Type', 'none')
+    if rec_type != 'continuation':
+        if iv != 1:
+            commentary.error('non-continuation records must always have WARC-Segment-Number = 1', field, value)
+    elif rec_type in {'warcinfo', 'request', 'metadata', 'revisit'}:
+        commentary.recommendation('do not segment warc-type', warc_type)
 
 
 def validate_segment_total_length(field, value, record, version, commentary, pending):
@@ -507,7 +521,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen
     'continuation': {
         'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type',
                      'WARC-Segment-Origin-ID', 'WARC-Segment-Number', 'WARC-Target-URI'],
-        'optional': [],
+        'optional': ['WARC-Segment-Total-Length', 'WARC-Truncated'],
         'prohibited': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
         'validate': validate_continuation,
     },
@@ -522,10 +536,10 @@ def make_header_set(config, kinds):
 
 
 def validate_fields_against_rec_type(rec_type, config, rec_headers, commentary, allow_all=False):
-    for req in config.get('required', []):
+    for req in sorted(config.get('required', [])):
         if not rec_headers.get_header(req):
             commentary.error('missing required header', req)
-    for rec in config.get('recommended', []):
+    for rec in sorted(config.get('recommended', [])):
         if not rec_headers.get_header(rec):
             commentary.recommendation('missing recommended header', rec)
     allowed = make_header_set(config, ('required', 'optional', 'recommended'))
@@ -554,9 +568,6 @@ def validate_record(record):
 
     record_id = record.rec_headers.get_header('WARC-Record-ID')
     rec_type = record.rec_headers.get_header('WARC-Type')
-    if record_id is None:
-        print('no WARC-Record-ID seen, skipping validation', file=sys.stderr)
-        return
     commentary = Commentary(record_id, rec_type)
     pending = None
 

From c70e68eee640a86a9711223d8350a00f6301445a Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Sat, 26 Jan 2019 08:46:10 -0800
Subject: [PATCH 06/68] python 2.7 test fix

---
 warcio/tester.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/warcio/tester.py b/warcio/tester.py
index bdfe38f0..b74a3b03 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -105,7 +105,7 @@ def validate_warc_fields(record, commentary):
                 commentary.comment('Missing field-name : in warc-fields line:', line)
             else:
                 field_name = line.split(':', 1)[0]
-                if not re.fullmatch(token_re, field_name):
+                if not re.search(token_re, field_name):
                     commentary.comment('invalid warc-fields name:', field_name)
                 else:
                     lines.append(line)
@@ -248,7 +248,7 @@ def validate_actual_uri(field, value, record, version, commentary, pending):
     if re.search(r'\s', value, re.A):
         commentary.error('invalid uri, contains whitespace', field, value)
     scheme, rest = value.split(':', 1)
-    if not re.fullmatch(r'[A-Za-z][A-Za-z0-9+\-\.]*', scheme, re.A):
+    if not re.search(r'\A[A-Za-z][A-Za-z0-9+\-\.]*\Z', scheme, re.A):
         commentary.error('invalid uri scheme, bad character', field, value)
     # https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
 
@@ -283,7 +283,7 @@ def validate_timestamp(field, value, record, version, commentary, pending):
             commentary.error('WARC 1.0 may not have fractional seconds', field, value)
     else:
         start, end = value.split('.', 1)
-        if not re.fullmatch(r'[0-9]{1,9}Z', end, re.A):
+        if not re.search(r'\A[0-9]{1,9}Z\Z', end, re.A):
             commentary.error('fractional seconds must have 1-9 digits', field, value)
 
     # XXX the above is pretty incomplete for dash, colon, trailing Z, etc
@@ -297,21 +297,21 @@ def validate_content_length(field, value, record, version, commentary, pending):
         commentary.error('must be an integer', field, value)
 
 
-token_re = r'[!"#$%&\'()*+\-\.0-9A-Z\^_`a-z|~]+'
-digest_re = r'[A-Za-z0-9/+\-_=]+'
+token_re = r'\A[!"#$%&\'()*+\-\.0-9A-Z\^_`a-z|~]+\Z'
+digest_re = r'\A[A-Za-z0-9/+\-_=]+\Z'
 
 
 def validate_content_type(field, value, record, version, commentary, pending):
     if '/' not in value:
         commentary.error('must contain a /', field, value)
     ctype, rest = value.split('/', 1)
-    if not re.fullmatch(token_re, ctype, re.A):
+    if not re.search(token_re, ctype, re.A):
         commentary.error('invalid type', field, value)
     if ';' in rest:
         subtype, rest = rest.split(';', 1)
     else:
         subtype = rest
-    if not re.fullmatch(token_re, subtype, re.A):
+    if not re.search(token_re, subtype, re.A):
         commentary.error('invalid subtype', field, value)
 
     # at this point there can be multiple parameters,
@@ -324,13 +324,13 @@ def validate_digest(field, value, record, version, commentary, pending):
     if ':' not in value:
         commentary.error('missing algorithm', field, value)
     algorithm, digest = value.split(':', 1)
-    if not re.fullmatch(token_re, algorithm, re.A):
+    if not re.search(token_re, algorithm, re.A):
         commentary.error('invalid algorithm', field, value)
-    if not re.fullmatch(token_re, digest, re.A):
+    if not re.search(token_re, digest, re.A):
         # https://github.com/iipc/warc-specifications/issues/48
         # commentary.comment('spec incorrectly says this is an invalid digest', field, value)
         pass
-    if not re.fullmatch(digest_re, digest, re.A):
+    if not re.search(digest_re, digest, re.A):
         commentary.comment('Invalid-looking digest value', field, value)
 
 

From 1847633cae2e221940e314c28036a0c4bdb4323b Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Sat, 26 Jan 2019 08:51:08 -0800
Subject: [PATCH 07/68] python 2.7 fixes

---
 warcio/tester.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/warcio/tester.py b/warcio/tester.py
index b74a3b03..c978a404 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -245,10 +245,10 @@ def validate_actual_uri(field, value, record, version, commentary, pending):
         commentary.error('uri must not be within <>', field, value)
     if ':' not in value:
         commentary.error('invalid uri, no scheme', field, value)
-    if re.search(r'\s', value, re.A):
+    if re.search(r'\s', value):
         commentary.error('invalid uri, contains whitespace', field, value)
     scheme, rest = value.split(':', 1)
-    if not re.search(r'\A[A-Za-z][A-Za-z0-9+\-\.]*\Z', scheme, re.A):
+    if not re.search(r'\A[A-Za-z][A-Za-z0-9+\-\.]*\Z', scheme):
         commentary.error('invalid uri scheme, bad character', field, value)
     # https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
 
@@ -283,7 +283,7 @@ def validate_timestamp(field, value, record, version, commentary, pending):
             commentary.error('WARC 1.0 may not have fractional seconds', field, value)
     else:
         start, end = value.split('.', 1)
-        if not re.search(r'\A[0-9]{1,9}Z\Z', end, re.A):
+        if not re.search(r'\A[0-9]{1,9}Z\Z', end):
             commentary.error('fractional seconds must have 1-9 digits', field, value)
 
     # XXX the above is pretty incomplete for dash, colon, trailing Z, etc
@@ -305,13 +305,13 @@ def validate_content_type(field, value, record, version, commentary, pending):
     if '/' not in value:
         commentary.error('must contain a /', field, value)
     ctype, rest = value.split('/', 1)
-    if not re.search(token_re, ctype, re.A):
+    if not re.search(token_re, ctype):
         commentary.error('invalid type', field, value)
     if ';' in rest:
         subtype, rest = rest.split(';', 1)
     else:
         subtype = rest
-    if not re.search(token_re, subtype, re.A):
+    if not re.search(token_re, subtype):
         commentary.error('invalid subtype', field, value)
 
     # at this point there can be multiple parameters,
@@ -324,13 +324,13 @@ def validate_digest(field, value, record, version, commentary, pending):
     if ':' not in value:
         commentary.error('missing algorithm', field, value)
     algorithm, digest = value.split(':', 1)
-    if not re.search(token_re, algorithm, re.A):
+    if not re.search(token_re, algorithm):
         commentary.error('invalid algorithm', field, value)
-    if not re.search(token_re, digest, re.A):
+    if not re.search(token_re, digest):
         # https://github.com/iipc/warc-specifications/issues/48
         # commentary.comment('spec incorrectly says this is an invalid digest', field, value)
         pass
-    if not re.search(digest_re, digest, re.A):
+    if not re.search(digest_re, digest):
         commentary.comment('Invalid-looking digest value', field, value)
 
 

From 2c676db25daf0abe842ddafd5fa058360ae218f0 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Sat, 26 Jan 2019 11:08:05 -0800
Subject: [PATCH 08/68] coverage

---
 .../data/standard-torture-validate-field.warc |  52 ++++++++
 .../standard-torture-validate-record.warc     |   5 +
 test/test_tests.py                            | 123 +++++++++++++++++-
 warcio/tester.py                              |  73 +++++++----
 4 files changed, 219 insertions(+), 34 deletions(-)
 create mode 100644 test/data/standard-torture-validate-field.warc

diff --git a/test/data/standard-torture-validate-field.warc b/test/data/standard-torture-validate-field.warc
new file mode 100644
index 00000000..2c28d72d
--- /dev/null
+++ b/test/data/standard-torture-validate-field.warc
@@ -0,0 +1,52 @@
+WARC/1.0
+WARC-Target-URI: <http://example.com/>
+WARC-Target-URI: example.com
+WARC-Target-URI: ex ample.com
+WARC-Target-URI: h<>ttp://example.com/
+WARC-Type: does-not-exist
+WARC-Type: CAPITALIZED
+WARC-Concurrent-To: http://example.com/
+WARC-Record-ID: <foo:bar>
+WARC-Date: 2017-03-06T04:03:53Z
+WARC-Date: 2017-03-06T04:03:53.Z
+Content-Type: asdf
+Content-Type: has space/asdf
+Content-Type: asdf/has space
+Content-Type: asdf/has space;asdf
+WARC-Block-Digest: asdf
+WARC-Block-Digest: has space:asdf
+WARC-Block-Digest: sha1:&$*^&*^#*&^
+WARC-IP-Address: 1.2.3.4.5
+WARC-Truncated: invalid
+WARC-Warcinfo-ID: asdf:asdf
+WARC-Filename: not-yet-tested
+WARC-Profile: asdf
+WARC-Profile: http://netpreserve.org/warc/1.0/revisit/identical-payload-digest
+WARC-Identified-Payload-Type: asdf
+WARC-Segment-Origin-ID: http://example.com
+WARC-Segment-Number: not-an-integer
+WARC-Segment-Number: 0
+WARC-Segment-Number: 1
+WARC-Segment-Number: 2
+WARC-Segment-Total-Length: 0
+WARC-Segment-Total-Length: not-an-integer
+WARC-Refers-To-Target-URI: http://example.com
+WARC-Refers-To-Date: not-a-date
+WARC-Unknown-Field: asdf
+Content-Length: 0
+
+
+WARC/1.1
+WARC-Date: 2017-03-06T04:03:53Z
+WARC-Date: 2017-03-06T04:03:53.Z
+WARC-Type: invalid
+Content-Length: 0
+
+
+WARC/1.1
+WARC-Type: request
+WARC-Segment-Number: 1
+Content-Length: 0
+
+
+WARC/invalid
diff --git a/test/data/standard-torture-validate-record.warc b/test/data/standard-torture-validate-record.warc
index 5181ea38..d212f370 100644
--- a/test/data/standard-torture-validate-record.warc
+++ b/test/data/standard-torture-validate-record.warc
@@ -71,6 +71,11 @@ WARC-Profile: http://netpreserve.org/warc/1.0/revisit/server-not-modified
 Content-Length: 0
 
 
+WARC/1.0
+WARC-Type: conversion
+Content-Length: 0
+
+
 WARC/1.0
 WARC-Type: continuation
 WARC-Segment-Number: 1
diff --git a/test/test_tests.py b/test/test_tests.py
index 239d2461..19b7e377 100644
--- a/test/test_tests.py
+++ b/test/test_tests.py
@@ -2,6 +2,7 @@
 
 from . import get_test_file
 from .test_cli import patch_stdout
+from warcio.utils import to_native_str
 
 
 def helper(args, expected_exit_value):
@@ -14,14 +15,14 @@ def helper(args, expected_exit_value):
         finally:
             assert exit_value == expected_exit_value
 
-        return buff.getvalue()
+        return to_native_str(buff.getvalue())
 
 
 def remove_before_test_data(s):
-    ret = b''
+    ret = ''
     for line in s.splitlines(True):
-        if b'/test/data/' in line:
-            line = b'test/data/' + line.split(b'/test/data/', 1)[1]
+        if '/test/data/' in line:
+            line = 'test/data/' + line.split('/test/data/', 1)[1]
         ret += line
     return ret
 
@@ -33,7 +34,7 @@ def test_torture_missing():
     args = ['test']
     args.extend(files)
 
-    expected = b"""\
+    expected = """\
 test/data/standard-torture-missing.warc
   WARC-Record-ID None
     WARC-Type warcinfo
@@ -55,7 +56,7 @@ def test_torture_validate_record():
     args = ['test']
     args.extend(files)
 
-    expected = b"""\
+    expected = """\
 test/data/standard-torture-validate-record.warc
   WARC-Record-ID None
     WARC-Type warcinfo
@@ -85,6 +86,7 @@ def test_torture_validate_record():
     digest not present
     error: missing required header WARC-Date
     error: missing required header WARC-Record-ID
+    error: recource records for dns: shall have Content-Type of text/dns, saw text/plain
   WARC-Record-ID None
     WARC-Type resource
     digest not present
@@ -133,6 +135,12 @@ def test_torture_validate_record():
     recommendation: missing recommended header WARC-Refers-To
     recommendation: missing recommended header WARC-Refers-To-Date
     comment: extension seen warc-profile http://netpreserve.org/warc/1.0/revisit/server-not-modified
+  WARC-Record-ID None
+    WARC-Type conversion
+    digest not present
+    error: missing required header WARC-Date
+    error: missing required header WARC-Record-ID
+    error: missing required header WARC-Target-URI
   WARC-Record-ID None
     WARC-Type continuation
     digest not present
@@ -145,5 +153,106 @@ def test_torture_validate_record():
 """
 
     value = helper(args, 0)
-    print(remove_before_test_data(value).decode())
+    print(remove_before_test_data(value))
+    assert remove_before_test_data(value) == expected
+
+
+def test_torture_validate_field():
+    files = ['standard-torture-validate-field.warc']
+    files = [get_test_file(filename) for filename in files]
+
+    args = ['test']
+    args.extend(files)
+
+    expected = """\
+test/data/standard-torture-validate-field.warc
+  WARC-Record-ID <foo:bar>
+    WARC-Type does-not-exist
+    unknown hash algorithm name in block digest
+    error: uri must not be within <> warc-target-uri <http://example.com/>
+    error: invalid uri scheme, bad character warc-target-uri <http://example.com/>
+    error: duplicate field seen warc-target-uri example.com
+    error: invalid uri, no scheme warc-target-uri example.com
+    error: duplicate field seen warc-target-uri ex ample.com
+    error: invalid uri, no scheme warc-target-uri ex ample.com
+    error: invalid uri, contains whitespace warc-target-uri ex ample.com
+    error: invalid uri scheme, bad character warc-target-uri ex ample.com
+    error: duplicate field seen warc-target-uri h<>ttp://example.com/
+    error: invalid uri scheme, bad character warc-target-uri h<>ttp://example.com/
+    error: duplicate field seen warc-type CAPITALIZED
+    error: uri must be within <> warc-concurrent-to http://example.com/
+    error: duplicate field seen warc-date 2017-03-06T04:03:53.Z
+    error: WARC 1.0 may not have fractional seconds warc-date 2017-03-06T04:03:53.Z
+    error: must contain a / content-type asdf
+    error: invalid subtype content-type asdf
+    error: duplicate field seen content-type has space/asdf
+    error: invalid type content-type has space/asdf
+    error: duplicate field seen content-type asdf/has space
+    error: invalid subtype content-type asdf/has space
+    error: duplicate field seen content-type asdf/has space;asdf
+    error: invalid subtype content-type asdf/has space;asdf
+    error: missing algorithm warc-block-digest asdf
+    error: duplicate field seen warc-block-digest has space:asdf
+    error: invalid algorithm warc-block-digest has space:asdf
+    error: duplicate field seen warc-block-digest sha1:&$*^&*^#*&^
+    error: uri must be within <> warc-warcinfo-id asdf:asdf
+    error: duplicate field seen warc-profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest
+    error: must contain a / warc-identified-payload-type asdf
+    error: invalid subtype warc-identified-payload-type asdf
+    error: uri must be within <> warc-segment-origin-id http://example.com
+    error: must be an integer warc-segment-number not-an-integer
+    error: duplicate field seen warc-segment-number 0
+    error: must be 1 or greater warc-segment-number 0
+    error: non-continuation records must always have WARC-Segment-Number = 1 warc-segment-number 0
+    error: duplicate field seen warc-segment-number 1
+    error: duplicate field seen warc-segment-number 2
+    error: non-continuation records must always have WARC-Segment-Number = 1 warc-segment-number 2
+    error: duplicate field seen warc-segment-total-length not-an-integer
+    error: must be an integer warc-segment-total-length not-an-integer
+    comment: unknown WARC-Type warc-type does-not-exist
+    comment: WARC-Type is not lower-case warc-type CAPITALIZED
+    comment: unknown WARC-Type warc-type CAPITALIZED
+    comment: unknown digest algorithm warc-block-digest asdf
+    comment: Invalid-looking digest value warc-block-digest sha1:&$*^&*^#*&^
+    comment: did not check ip address format, install ipaddress module from pypi if you care
+    comment: extension seen warc-truncated invalid
+    comment: extension seen warc-profile asdf
+    comment: extension seen warc-profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest
+    comment: field was introduced after this warc version WARC-Refers-To-Target-URI http://example.com 1.0
+    comment: field was introduced after this warc version WARC-Refers-To-Date not-a-date 1.0
+    comment: unknown field, no validation performed WARC-Unknown-Field asdf
+  WARC-Record-ID None
+    WARC-Type invalid
+    digest not present
+    error: duplicate field seen warc-date 2017-03-06T04:03:53.Z
+    error: fractional seconds must have 1-9 digits warc-date 2017-03-06T04:03:53.Z
+    comment: unknown WARC-Type warc-type invalid
+  WARC-Record-ID None
+    WARC-Type request
+    digest not present
+    error: missing required header Content-Type
+    error: missing required header WARC-Date
+    error: missing required header WARC-Record-ID
+    error: missing required header WARC-Target-URI
+    recommendation: do not segment WARC-Type request
+    comment: no configuration seen for WARC-Segment-Number request
+"""
+
+    value = helper(args, 0)
+    print(remove_before_test_data(value))
+    assert remove_before_test_data(value) == expected
+
+
+def test_arc():
+    files = ['does-not-exist.arc']
+    files = [get_test_file(filename) for filename in files]
+
+    args = ['test']
+    args.extend(files)
+
+    expected = """\
+test/data/does-not-exist.arc
+"""
+
+    value = helper(args, 0)
     assert remove_before_test_data(value) == expected
diff --git a/warcio/tester.py b/warcio/tester.py
index c978a404..4c2f8299 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -5,15 +5,15 @@
 import traceback
 
 from warcio.archiveiterator import WARCIterator
-from warcio.utils import to_native_str
+from warcio.utils import to_native_str, Digester
 
 
-def try_ipaddress_init():
+def try_ipaddress_import():
     # ipaddress is in 3.3+ but not 2.7. It is in pypi but we wish to limit dependencies.
     try:
         import ipaddress
     except ImportError:  # pragma: no cover
-        pass
+        print('ipaddress module not imported')
 
 
 class Commentary:
@@ -75,10 +75,10 @@ def validate_warc_fields(record, commentary):
     # field-value = *( field-content | LWS )  # LWS signals continuations
     # field-name = token  # token_re
 
-    content = record.content  # TESTME
+    content = record.content
     try:
         text = to_native_str(content, 'utf-8', errors='strict')
-    except UnicodeDecodeError as e:  # TESTME
+    except UnicodeDecodeError as e:
         commentary.error('warc-fields contains invalid utf-8: '+str(e))
         text = to_native_str(content, 'utf-8', errors='replace')
 
@@ -137,7 +137,7 @@ def validate_warcinfo(record, commentary, pending):
 
 
 def validate_response(record, commentary, pending):
-    target_uri = record.rec_headers.get_header('WARC-Target-URI', 'none').lower()  # TESTME
+    target_uri = record.rec_headers.get_header('WARC-Target-URI', 'none').lower()
 
     if target_uri.startswith('http:') or target_uri.startswith('https:'):
         content_type = record.rec_headers.get_header('Content-Type', 'none')
@@ -154,7 +154,7 @@ def validate_response(record, commentary, pending):
 
 
 def validate_resource(record, commentary, pending):
-    target_uri = record.rec_headers.get_header('WARC-Target-URI', '').lower()  # TESTME
+    target_uri = record.rec_headers.get_header('WARC-Target-URI', '').lower()
 
     if target_uri.startswith('dns:'):
         content_type = record.rec_headers.get_header('Content-Type', 'none')
@@ -169,7 +169,7 @@ def validate_resource(record, commentary, pending):
 
 
 def validate_request(record, commentary, pending):
-    target_uri = record.rec_headers.get_header('WARC-Target-URI', 'none').lower()  # TESTME
+    target_uri = record.rec_headers.get_header('WARC-Target-URI', 'none').lower()
 
     if target_uri.startswith('http:') or target_uri.startswith('https:'):
         content_type = record.rec_headers.get_header('Content-Type')
@@ -186,7 +186,7 @@ def validate_request(record, commentary, pending):
 
 
 def validate_metadata(record, commentary, pending):
-    content_type = record.rec_headers.get_header('Content-Type', 'none')  # TESTME
+    content_type = record.rec_headers.get_header('Content-Type', 'none')
     if content_type.lower() == 'application/warc-fields':
         # dublin core plus via, hopsFromSeed, fetchTimeMs -- w1.1 section 6
         # via: uri -- example in Warc 1.1 section 10.5 does not have <> around it
@@ -196,7 +196,7 @@ def validate_metadata(record, commentary, pending):
 
 
 def validate_revisit(record, commentary, pending):
-    warc_profile = record.rec_headers.get_header('WARC-Profile', 'none')  # TESTME
+    warc_profile = record.rec_headers.get_header('WARC-Profile', 'none')
 
     if warc_profile.endswith('/revisit/identical-payload-digest') or warc_profile.endswith('/revisit/uri-agnostic-identical-payload-digest'):
         config = {
@@ -222,11 +222,11 @@ def validate_revisit(record, commentary, pending):
 def validate_conversion(record, commentary, pending):
     # where practical, have a warc-refers-to field -- not quite a recommendation, perhaps make it a comment?
     # suggests there should be a corresponding metadata record -- which may have a WARC-Refers-To
-    pass  # TESTME
+    pass
 
 
 def validate_continuation(record, commentary, pending):
-    commentary.comment('warcio test continuation code has not been tested, expect bugs')  # TESTME
+    commentary.comment('warcio test continuation code has not been tested, expect bugs')
 
     segment_number = record.rec_headers.get_header('WARC-Segment-Number', 'none')
     if segment_number.isdigit() and int(segment_number) < 2:
@@ -240,14 +240,14 @@ def validate_actual_uri(field, value, record, version, commentary, pending):
     # should use a registered scheme
     # %XX encoding, normalize to upper case
     # schemes are case-insensitive and normalize to lower
-    if value.startswith('<') or value.endswith('>'):  # TESTME
+    if value.startswith('<') or value.endswith('>'):
         # wget 1.19 bug caused by WARC 1.0 spec error
         commentary.error('uri must not be within <>', field, value)
     if ':' not in value:
         commentary.error('invalid uri, no scheme', field, value)
     if re.search(r'\s', value):
         commentary.error('invalid uri, contains whitespace', field, value)
-    scheme, rest = value.split(':', 1)
+    scheme = value.split(':', 1)[0]
     if not re.search(r'\A[A-Za-z][A-Za-z0-9+\-\.]*\Z', scheme):
         commentary.error('invalid uri scheme, bad character', field, value)
     # https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
@@ -282,9 +282,10 @@ def validate_timestamp(field, value, record, version, commentary, pending):
             # XXX specification infelicity: would be nice to have 'advice to implementers' here
             commentary.error('WARC 1.0 may not have fractional seconds', field, value)
     else:
-        start, end = value.split('.', 1)
-        if not re.search(r'\A[0-9]{1,9}Z\Z', end):
-            commentary.error('fractional seconds must have 1-9 digits', field, value)
+        if '.' in value:
+            start, end = value.split('.', 1)
+            if not re.search(r'\A[0-9]{1,9}Z\Z', end):
+                commentary.error('fractional seconds must have 1-9 digits', field, value)
 
     # XXX the above is pretty incomplete for dash, colon, trailing Z, etc
 
@@ -304,7 +305,12 @@ def validate_content_length(field, value, record, version, commentary, pending):
 def validate_content_type(field, value, record, version, commentary, pending):
     if '/' not in value:
         commentary.error('must contain a /', field, value)
-    ctype, rest = value.split('/', 1)
+    splits = value.split('/', 1)
+    ctype = splits[0]
+    if len(splits) > 1:
+        rest = splits[1]
+    else:
+        rest = ''
     if not re.search(token_re, ctype):
         commentary.error('invalid type', field, value)
     if ';' in rest:
@@ -323,9 +329,19 @@ def validate_content_type(field, value, record, version, commentary, pending):
 def validate_digest(field, value, record, version, commentary, pending):
     if ':' not in value:
         commentary.error('missing algorithm', field, value)
-    algorithm, digest = value.split(':', 1)
+    splits = value.split(':', 1)
+    algorithm = splits[0]
+    if len(splits) > 1:
+        digest = splits[1]
+    else:
+        digest = 'none'
     if not re.search(token_re, algorithm):
         commentary.error('invalid algorithm', field, value)
+    else:
+        try:
+            Digester(algorithm)
+        except ValueError:
+            commentary.comment('unknown digest algorithm', field, value)
     if not re.search(token_re, digest):
         # https://github.com/iipc/warc-specifications/issues/48
         # commentary.comment('spec incorrectly says this is an invalid digest', field, value)
@@ -389,8 +405,8 @@ def validate_segment_number(field, value, record, version, commentary, pending):
     if rec_type != 'continuation':
         if iv != 1:
             commentary.error('non-continuation records must always have WARC-Segment-Number = 1', field, value)
-    elif rec_type in {'warcinfo', 'request', 'metadata', 'revisit'}:
-        commentary.recommendation('do not segment warc-type', warc_type)
+    if rec_type in {'warcinfo', 'request', 'metadata', 'revisit'}:
+        commentary.recommendation('do not segment WARC-Type', rec_type)
 
 
 def validate_segment_total_length(field, value, record, version, commentary, pending):
@@ -418,7 +434,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen
         'validate': validate_uri,
     },
     'WARC-Block-Digest': {
-        'validate': validate_digest,  # openssl check? or just let check_digest get it?
+        'validate': validate_digest,
     },
     'WARC-Payload-Digest': {
         'validate': validate_digest,
@@ -487,6 +503,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen
         'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated',
                      'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-Identified-Payload-Type'],
         'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
+        'validate': validate_resource,
     },
     'request': {
         'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type',
@@ -577,6 +594,7 @@ def validate_record(record):
         field = field.lower()
         if field != 'warc-concurrent-to' and field in seen_fields:
             commentary.error('duplicate field seen', field, value)
+        seen_fields.add(field)
         if field not in warc_fields:
             commentary.comment('unknown field, no validation performed', field_case, value)
             continue
@@ -588,9 +606,8 @@ def validate_record(record):
         if 'validate' in config:
             config['validate'](field, value, record, version, commentary, pending)
 
-    # TODO: validate warc types: unknown should get a comment
     if rec_type not in record_types:
-        commentary.comment('unknown record type, no validation performed', rec_type)
+        pass  # we print a comment for this elsewhere
     else:
         validate_fields_against_rec_type(rec_type, record_types[rec_type], record.rec_headers, commentary)
         validate_record_against_rec_type(record_types[rec_type], record, commentary, pending)
@@ -614,7 +631,7 @@ def _process_one(warc):
                 record.content  # make sure digests are checked
                 # XXX might need to read and digest the raw stream to check digests for chunked encoding?
                 # XXX chunked lacks Content-Length and presumably the digest needs to be computed on the non-chunked bytes
-            except Exception:
+            except Exception:  # pragma: no cover
                 # because of the top-level try: to catch exceptions in WARCIterator, this is needed to debug our code
                 print('Caught exception in warcio test analysis code')
                 traceback.print_exc()
@@ -643,7 +660,6 @@ class Tester(object):
     def __init__(self, cmd):
         self.inputs = cmd.inputs
         self.exit_value = 0
-        try_ipaddress_init()
 
     def process_all(self):
         for warc in self.inputs:
@@ -651,9 +667,12 @@ def process_all(self):
             try:
                 self.process_one(warc)
             except Exception as e:
-                print('  saw exception '+str(e).rstrip(), file=sys.stderr)
+                print('  saw exception '+repr(e).rstrip(), file=sys.stderr)
                 print('  skipping rest of file', file=sys.stderr)
         return self.exit_value
 
     def process_one(self, filename):
         _process_one(filename)
+
+
+try_ipaddress_import()

From 97ee457f02f5b1508577e8d9118703a7b9b814f9 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Sun, 27 Jan 2019 15:25:36 -0800
Subject: [PATCH 09/68] py2 testing

---
 test/test_tests.py | 23 ++++++++++++++++++++---
 warcio/tester.py   | 14 ++------------
 2 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/test/test_tests.py b/test/test_tests.py
index 19b7e377..a197c3ba 100644
--- a/test/test_tests.py
+++ b/test/test_tests.py
@@ -1,8 +1,10 @@
+import six
+
 from warcio.cli import main
+from warcio.utils import to_native_str
 
 from . import get_test_file
 from .test_cli import patch_stdout
-from warcio.utils import to_native_str
 
 
 def helper(args, expected_exit_value):
@@ -154,7 +156,13 @@ def test_torture_validate_record():
 
     value = helper(args, 0)
     print(remove_before_test_data(value))
-    assert remove_before_test_data(value) == expected
+
+    ret = remove_before_test_data(value)
+
+    if six.PY2:
+        expected = expected.replace('\n    error: warc-fields contains invalid utf-8: \'utf-8\' codec can\'t decode byte 0xc3 in position 57: invalid continuation byte\n', '\n')
+
+    assert ret == expected
 
 
 def test_torture_validate_field():
@@ -195,6 +203,7 @@ def test_torture_validate_field():
     error: duplicate field seen warc-block-digest has space:asdf
     error: invalid algorithm warc-block-digest has space:asdf
     error: duplicate field seen warc-block-digest sha1:&$*^&*^#*&^
+    error: invalid ip warc-ip-address 1.2.3.4.5
     error: uri must be within <> warc-warcinfo-id asdf:asdf
     error: duplicate field seen warc-profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest
     error: must contain a / warc-identified-payload-type asdf
@@ -214,7 +223,6 @@ def test_torture_validate_field():
     comment: unknown WARC-Type warc-type CAPITALIZED
     comment: unknown digest algorithm warc-block-digest asdf
     comment: Invalid-looking digest value warc-block-digest sha1:&$*^&*^#*&^
-    comment: did not check ip address format, install ipaddress module from pypi if you care
     comment: extension seen warc-truncated invalid
     comment: extension seen warc-profile asdf
     comment: extension seen warc-profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest
@@ -240,6 +248,15 @@ def test_torture_validate_field():
 
     value = helper(args, 0)
     print(remove_before_test_data(value))
+
+    ret = remove_before_test_data(value)
+    if six.PY2:
+        if 'error: invalid ip warc-ip-address 1.2.3.4.5' not in ret:
+            # user did not install ipaddress module
+            expected = expected.replace('\n    error: invalid ip warc-ip-address 1.2.3.4.5\n', '\n')
+            ret = ret.replace('\n    comment: did not check ip address format, install ipaddress module from pypi if you care\n', '\n')
+
+
     assert remove_before_test_data(value) == expected
 
 
diff --git a/warcio/tester.py b/warcio/tester.py
index 4c2f8299..308f35fd 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -8,14 +8,6 @@
 from warcio.utils import to_native_str, Digester
 
 
-def try_ipaddress_import():
-    # ipaddress is in 3.3+ but not 2.7. It is in pypi but we wish to limit dependencies.
-    try:
-        import ipaddress
-    except ImportError:  # pragma: no cover
-        print('ipaddress module not imported')
-
-
 class Commentary:
     def __init__(self, record_id, rec_type):
         self._record_id = record_id
@@ -353,10 +345,11 @@ def validate_digest(field, value, record, version, commentary, pending):
 def validate_ip(field, value, record, version, commentary, pending):
     # ipv4 as dotted quad, or ipv6 per section 2.2 of rfc 4291
     try:
+        import ipaddress
         ipaddress.ip_address(value)
     except ValueError:
         commentary.error('invalid ip', field, value)
-    except NameError:
+    except (ImportError, NameError):
         commentary.comment('did not check ip address format, install ipaddress module from pypi if you care')
 
 
@@ -673,6 +666,3 @@ def process_all(self):
 
     def process_one(self, filename):
         _process_one(filename)
-
-
-try_ipaddress_import()

From df50151df2a21b00fa30ed592e6b2536f1367f97 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Sun, 27 Jan 2019 15:35:30 -0800
Subject: [PATCH 10/68] py2 windows testing

---
 test/test_tests.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/test/test_tests.py b/test/test_tests.py
index a197c3ba..01e72ef4 100644
--- a/test/test_tests.py
+++ b/test/test_tests.py
@@ -25,6 +25,8 @@ def remove_before_test_data(s):
     for line in s.splitlines(True):
         if '/test/data/' in line:
             line = 'test/data/' + line.split('/test/data/', 1)[1]
+        if '\\test\\data\\' in line:
+            line = 'test/data/' + line.split('\\test\\data\\', 1)[1]
         ret += line
     return ret
 
@@ -247,17 +249,16 @@ def test_torture_validate_field():
 """
 
     value = helper(args, 0)
-    print(remove_before_test_data(value))
-
     ret = remove_before_test_data(value)
+
     if six.PY2:
         if 'error: invalid ip warc-ip-address 1.2.3.4.5' not in ret:
             # user did not install ipaddress module
             expected = expected.replace('\n    error: invalid ip warc-ip-address 1.2.3.4.5\n', '\n')
             ret = ret.replace('\n    comment: did not check ip address format, install ipaddress module from pypi if you care\n', '\n')
 
-
-    assert remove_before_test_data(value) == expected
+    print(ret)
+    assert ret == expected
 
 
 def test_arc():

From 858a752021640a47f716522bb5ef1fdd30d3fb82 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Sun, 27 Jan 2019 23:11:46 -0800
Subject: [PATCH 11/68] coverage

---
 .../standard-torture-validate-record.warc     |  1 +
 test/test_tests.py                            | 55 ++++++++++++++++++-
 warcio/tester.py                              | 52 +++++++++---------
 3 files changed, 80 insertions(+), 28 deletions(-)

diff --git a/test/data/standard-torture-validate-record.warc b/test/data/standard-torture-validate-record.warc
index d212f370..08a39e50 100644
--- a/test/data/standard-torture-validate-record.warc
+++ b/test/data/standard-torture-validate-record.warc
@@ -1,6 +1,7 @@
 WARC/1.0
 WARC-Type: warcinfo
 Content-Type: application/warc-fields
+WARC-Refers-To: probhibited
 Content-Length: 146
 
  first line can't start with a space
diff --git a/test/test_tests.py b/test/test_tests.py
index 01e72ef4..0fdecc74 100644
--- a/test/test_tests.py
+++ b/test/test_tests.py
@@ -2,6 +2,7 @@
 
 from warcio.cli import main
 from warcio.utils import to_native_str
+import warcio.tester
 
 from . import get_test_file
 from .test_cli import patch_stdout
@@ -65,8 +66,10 @@ def test_torture_validate_record():
   WARC-Record-ID None
     WARC-Type warcinfo
     digest not present
+    error: uri must be within <> warc-refers-to probhibited
     error: missing required header WARC-Date
     error: missing required header WARC-Record-ID
+    error: field not allowed in record_type WARC-Refers-To warcinfo
     error: warc-fields contains invalid utf-8: 'utf-8' codec can't decode byte 0xc3 in position 57: invalid continuation byte
     comment: The first line of warc-fields cannot start with whitespace
     comment: warc-fields lines must end with \\r\\n: test: lines should end with \\r\\n
@@ -129,6 +132,7 @@ def test_torture_validate_record():
     recommendation: missing recommended header WARC-Refers-To
     recommendation: missing recommended header WARC-Refers-To-Date
     recommendation: missing recommended header WARC-Refers-To-Target-URI
+    comment: extension seen warc-profile http://netpreserve.org/warc/1.1/revisit/identical-payload-digest
   WARC-Record-ID None
     WARC-Type revisit
     digest not present
@@ -138,7 +142,6 @@ def test_torture_validate_record():
     error: missing required header WARC-Target-URI
     recommendation: missing recommended header WARC-Refers-To
     recommendation: missing recommended header WARC-Refers-To-Date
-    comment: extension seen warc-profile http://netpreserve.org/warc/1.0/revisit/server-not-modified
   WARC-Record-ID None
     WARC-Type conversion
     digest not present
@@ -227,7 +230,6 @@ def test_torture_validate_field():
     comment: Invalid-looking digest value warc-block-digest sha1:&$*^&*^#*&^
     comment: extension seen warc-truncated invalid
     comment: extension seen warc-profile asdf
-    comment: extension seen warc-profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest
     comment: field was introduced after this warc version WARC-Refers-To-Target-URI http://example.com 1.0
     comment: field was introduced after this warc version WARC-Refers-To-Date not-a-date 1.0
     comment: unknown field, no validation performed WARC-Unknown-Field asdf
@@ -274,3 +276,52 @@ def test_arc():
 
     value = helper(args, 0)
     assert remove_before_test_data(value) == expected
+
+
+def test_digests():
+    # needed for test coverage
+    files = ['example-digest-bad.warc']
+    files = [get_test_file(filename) for filename in files]
+
+    args = ['test']
+    args.extend(files)
+
+    expected = """\
+test/data/example-digest-bad.warc
+  WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
+    WARC-Type request
+    payload digest failed: sha1:1112H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ
+    error: WARC-IP-Address should be used for http and https requests
+  WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
+    WARC-Type request
+    digest pass
+    error: WARC-IP-Address should be used for http and https requests
+  WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
+    WARC-Type request
+    digest pass
+    error: WARC-IP-Address should be used for http and https requests
+  WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
+    WARC-Type request
+    digest pass
+    error: WARC-IP-Address should be used for http and https requests
+"""
+
+    value = helper(args, 0)
+    assert remove_before_test_data(value) == expected
+
+
+def test_leftovers():
+    commentary = warcio.tester.Commentary('id', 'type')
+
+    # hard to test because invalid WARC Content-Length raises in archiveiterator
+    warcio.tester.validate_content_length('content-length', 'not-an-integer', None, '1.0', commentary, None)
+
+    # hard to test because warcio checks the WARC version
+    warcio.tester.validate_profile('blah', 'blah', None, '999', commentary, None)
+
+    expected = '''\
+error: must be an integer content-length not-an-integer
+comment: no profile check because unknown warc version blah blah
+'''
+
+    assert '\n'.join(commentary.comments())+'\n' == expected
diff --git a/warcio/tester.py b/warcio/tester.py
index 308f35fd..de9f3ca1 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -2,10 +2,10 @@
 
 import re
 import sys
-import traceback
 
 from warcio.archiveiterator import WARCIterator
 from warcio.utils import to_native_str, Digester
+from warcio.exceptions import ArchiveLoadFailed
 
 
 class Commentary:
@@ -196,8 +196,11 @@ def validate_revisit(record, commentary, pending):
             'recommended': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date'],
         }
         validate_fields_against_rec_type('revisit', config, record.rec_headers, commentary, allow_all=True)
-        # may have record block; if not, shall have Content-Length: 0, if yes, should be like a response record, truncated FOR LENGTH ONLY if desired
-        # recommended that server response headers be preserved "in this manner"
+        # may have record block;
+        #  if not, shall have Content-Length: 0,
+        #  if yes, should be like a response record, truncated FOR LENGTH ONLY if desired
+        #  recommended that server response headers be preserved "in this manner"
+        #   I suppose that means headers are required if there is any content?!
 
     elif warc_profile.endswith('/revisit/server-not-modified'):
         config = {
@@ -205,7 +208,9 @@ def validate_revisit(record, commentary, pending):
             'prohibited': ['WARC-Payload-Digest'],
         }
         validate_fields_against_rec_type('revisit', config, record.rec_headers, commentary, allow_all=True)
-        #   may have content body; if not, shall have Content-Length: 0, if yes, should be like a response record, truncated if desired
+        #   may have content body;
+        #     if not, shall have Content-Length: 0,
+        #     if yes, should be like a response record, truncated if desired
         #   WARC-Refers-To-Date should be the same as WARC-Date in the original record if present
     else:
         commentary.comment('no revisit details validation done due to unknown profile')
@@ -343,13 +348,12 @@ def validate_digest(field, value, record, version, commentary, pending):
 
 
 def validate_ip(field, value, record, version, commentary, pending):
-    # ipv4 as dotted quad, or ipv6 per section 2.2 of rfc 4291
     try:
         import ipaddress
         ipaddress.ip_address(value)
     except ValueError:
         commentary.error('invalid ip', field, value)
-    except (ImportError, NameError):
+    except (ImportError, NameError):  # pragma: no cover (for python 2.7)
         commentary.comment('did not check ip address format, install ipaddress module from pypi if you care')
 
 
@@ -369,12 +373,14 @@ def validate_filename(field, value, record, version, commentary, pending):
 
 
 profiles = {
-    '1.0': ['http://netpreserve.org/warc/1.1/revisit/identical-payload-digest',
-            'http://netpreserve.org/warc/1.1/revisit/server-not-modified',
+    # XXX WARC/0.17 and WARC/0.18
+    '1.0': ['http://netpreserve.org/warc/1.0/revisit/identical-payload-digest',
+            'http://netpreserve.org/warc/1.0/revisit/server-not-modified',
             # the following removed from iipc/webarchive-commons in may 2017; common in the wild TODO comment or not?
+            # https://github.com/iipc/webarchive-commons/commits/988bec707c27a01333becfc3bd502af4441ea1e1/src/main/java/org/archive/format/warc/WARCConstants.java
             'http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest'],
-    '1.1': ['http://netpreserve.org/warc/1.0/revisit/identical-payload-digest',
-            'http://netpreserve.org/warc/1.0/revisit/server-not-modified'],
+    '1.1': ['http://netpreserve.org/warc/1.1/revisit/identical-payload-digest',
+            'http://netpreserve.org/warc/1.1/revisit/server-not-modified'],
 }
 
 
@@ -614,21 +620,15 @@ def _process_one(warc):
     with open(warc, 'rb') as stream:
         for record in WARCIterator(stream, check_digests=True, fixup_bugs=False):
 
-            try:
-                record = WrapRecord(record)
-                digest_present = (record.rec_headers.get_header('WARC-Payload-Digest') or
-                                  record.rec_headers.get_header('WARC-Block-Digest'))
+            record = WrapRecord(record)
+            digest_present = (record.rec_headers.get_header('WARC-Payload-Digest') or
+                              record.rec_headers.get_header('WARC-Block-Digest'))
 
-                commentary = validate_record(record)
+            commentary = validate_record(record)
 
-                record.content  # make sure digests are checked
-                # XXX might need to read and digest the raw stream to check digests for chunked encoding?
-                # XXX chunked lacks Content-Length and presumably the digest needs to be computed on the non-chunked bytes
-            except Exception:  # pragma: no cover
-                # because of the top-level try: to catch exceptions in WARCIterator, this is needed to debug our code
-                print('Caught exception in warcio test analysis code')
-                traceback.print_exc()
-                exit(1)
+            record.content  # make sure digests are checked
+            # XXX might need to read and digest the raw stream to check digests for chunked encoding?
+            # XXX chunked lacks Content-Length and presumably the digest needs to be computed on the non-chunked bytes
 
             if commentary.has_comments() or record.digest_checker.passed is False:
                 print(' ', 'WARC-Record-ID', commentary.record_id())
@@ -637,7 +637,7 @@ def _process_one(warc):
                 if record.digest_checker.passed is True:
                     print('    digest pass')
                 elif record.digest_checker.passed is None:
-                    if digest_present:
+                    if digest_present:  # pragma: no cover
                         print('    digest present but not checked')
                     else:
                         print('    digest not present')
@@ -659,8 +659,8 @@ def process_all(self):
             print(warc)
             try:
                 self.process_one(warc)
-            except Exception as e:
-                print('  saw exception '+repr(e).rstrip(), file=sys.stderr)
+            except ArchiveLoadFailed as e:
+                print('  saw exception ArchiveLoadFailed: '+str(e).rstrip(), file=sys.stderr)
                 print('  skipping rest of file', file=sys.stderr)
         return self.exit_value
 

From 5bfffea4c2200284cb876c8700b5ac578a4ec544 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Sun, 27 Jan 2019 23:57:53 -0800
Subject: [PATCH 12/68] branch coverage

---
 .../data/standard-torture-validate-field.warc |  1 +
 .../standard-torture-validate-record.warc     | 26 +++++++++++
 test/test_tests.py                            | 44 ++++++++++++++++++-
 warcio/tester.py                              |  1 +
 4 files changed, 71 insertions(+), 1 deletion(-)

diff --git a/test/data/standard-torture-validate-field.warc b/test/data/standard-torture-validate-field.warc
index 2c28d72d..c88d3ee6 100644
--- a/test/data/standard-torture-validate-field.warc
+++ b/test/data/standard-torture-validate-field.warc
@@ -39,6 +39,7 @@ Content-Length: 0
 WARC/1.1
 WARC-Date: 2017-03-06T04:03:53Z
 WARC-Date: 2017-03-06T04:03:53.Z
+WARC-Date: 2017-03-06T04:03:53.0Z
 WARC-Type: invalid
 Content-Length: 0
 
diff --git a/test/data/standard-torture-validate-record.warc b/test/data/standard-torture-validate-record.warc
index 08a39e50..6f06205e 100644
--- a/test/data/standard-torture-validate-record.warc
+++ b/test/data/standard-torture-validate-record.warc
@@ -41,9 +41,23 @@ Content-Type: text/dns
 Content-Length: 0
 
 
+WARC/1.0
+WARC-Type: resource
+WARC-Target-URI: foo:bar
+Content-Length: 0
+
+
+WARC/1.0
+WARC-Type: request
+WARC-Target-URI: hTtP://example.com/
+Content-Type: text/plain
+Content-Length: 0
+
+
 WARC/1.0
 WARC-Type: request
 WARC-Target-URI: hTtP://example.com/
+WARC-IP-Address: 1.2.3.4
 Content-Type: text/plain
 Content-Length: 0
 
@@ -54,6 +68,12 @@ Content-Type: application/warc-fields
 Content-Length: 0
 
 
+WARC/1.0
+WARC-Type: metadata
+Content-Type: not-application/warc-fields
+Content-Length: 0
+
+
 WARC/1.0
 WARC-Type: revisit
 WARC-Profile: none
@@ -83,3 +103,9 @@ WARC-Segment-Number: 1
 Content-Length: 0
 
 
+WARC/1.0
+WARC-Type: continuation
+WARC-Segment-Number: 2
+Content-Length: 0
+
+
diff --git a/test/test_tests.py b/test/test_tests.py
index 0fdecc74..174466c8 100644
--- a/test/test_tests.py
+++ b/test/test_tests.py
@@ -99,6 +99,12 @@ def test_torture_validate_record():
     digest not present
     error: missing required header WARC-Date
     error: missing required header WARC-Record-ID
+  WARC-Record-ID None
+    WARC-Type resource
+    digest not present
+    error: missing required header Content-Type
+    error: missing required header WARC-Date
+    error: missing required header WARC-Record-ID
   WARC-Record-ID None
     WARC-Type request
     digest not present
@@ -106,12 +112,23 @@ def test_torture_validate_record():
     error: missing required header WARC-Record-ID
     error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw text/plain
     error: WARC-IP-Address should be used for http and https requests
+  WARC-Record-ID None
+    WARC-Type request
+    digest not present
+    error: missing required header WARC-Date
+    error: missing required header WARC-Record-ID
+    error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw text/plain
   WARC-Record-ID None
     WARC-Type metadata
     digest not present
     error: missing required header WARC-Date
     error: missing required header WARC-Record-ID
     comment: warc-fields body present but empty
+  WARC-Record-ID None
+    WARC-Type metadata
+    digest not present
+    error: missing required header WARC-Date
+    error: missing required header WARC-Record-ID
   WARC-Record-ID None
     WARC-Type revisit
     digest not present
@@ -157,6 +174,14 @@ def test_torture_validate_record():
     error: missing required header WARC-Target-URI
     error: continuation record must have WARC-Segment-Number > 1, saw 1
     comment: warcio test continuation code has not been tested, expect bugs
+  WARC-Record-ID None
+    WARC-Type continuation
+    digest not present
+    error: missing required header WARC-Date
+    error: missing required header WARC-Record-ID
+    error: missing required header WARC-Segment-Origin-ID
+    error: missing required header WARC-Target-URI
+    comment: warcio test continuation code has not been tested, expect bugs
 """
 
     value = helper(args, 0)
@@ -238,6 +263,7 @@ def test_torture_validate_field():
     digest not present
     error: duplicate field seen warc-date 2017-03-06T04:03:53.Z
     error: fractional seconds must have 1-9 digits warc-date 2017-03-06T04:03:53.Z
+    error: duplicate field seen warc-date 2017-03-06T04:03:53.0Z
     comment: unknown WARC-Type warc-type invalid
   WARC-Record-ID None
     WARC-Type request
@@ -280,7 +306,7 @@ def test_arc():
 
 def test_digests():
     # needed for test coverage
-    files = ['example-digest-bad.warc']
+    files = ['example-digest-bad.warc', 'example.warc']
     files = [get_test_file(filename) for filename in files]
 
     args = ['test']
@@ -304,6 +330,21 @@ def test_digests():
     WARC-Type request
     digest pass
     error: WARC-IP-Address should be used for http and https requests
+test/data/example.warc
+  WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
+    WARC-Type request
+    digest not present
+    error: WARC-IP-Address should be used for http and https requests
+  WARC-Record-ID <urn:uuid:e6e395ca-0221-11e7-a18d-0242ac120005>
+    WARC-Type revisit
+    digest present but not checked
+    recommendation: missing recommended header WARC-Refers-To
+    comment: field was introduced after this warc version WARC-Refers-To-Target-URI http://example.com/ 1.0
+    comment: field was introduced after this warc version WARC-Refers-To-Date 2017-03-06T04:02:06Z 1.0
+  WARC-Record-ID <urn:uuid:e6e41fea-0221-11e7-8fe3-0242ac120007>
+    WARC-Type request
+    digest not present
+    error: WARC-IP-Address should be used for http and https requests
 """
 
     value = helper(args, 0)
@@ -312,6 +353,7 @@ def test_digests():
 
 def test_leftovers():
     commentary = warcio.tester.Commentary('id', 'type')
+    assert not commentary.has_comments()
 
     # hard to test because invalid WARC Content-Length raises in archiveiterator
     warcio.tester.validate_content_length('content-length', 'not-an-integer', None, '1.0', commentary, None)
diff --git a/warcio/tester.py b/warcio/tester.py
index de9f3ca1..eaf7f09f 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -638,6 +638,7 @@ def _process_one(warc):
                     print('    digest pass')
                 elif record.digest_checker.passed is None:
                     if digest_present:  # pragma: no cover
+                        # WARC record missing Content-Length: header, which is verboten
                         print('    digest present but not checked')
                     else:
                         print('    digest not present')

From bb31f14707b2019e6b406ffc2f7dc89af418d17e Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Mon, 28 Jan 2019 00:05:31 -0800
Subject: [PATCH 13/68] py2 branch coverage

---
 test/test_tests.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/test_tests.py b/test/test_tests.py
index 174466c8..98517308 100644
--- a/test/test_tests.py
+++ b/test/test_tests.py
@@ -191,6 +191,7 @@ def test_torture_validate_record():
 
     if six.PY2:
         expected = expected.replace('\n    error: warc-fields contains invalid utf-8: \'utf-8\' codec can\'t decode byte 0xc3 in position 57: invalid continuation byte\n', '\n')
+        ret = ret.replace('\n    comment: did not check ip address format, install ipaddress module from pypi if you care\n', '\n')
 
     assert ret == expected
 

From cc542596826f3112965107229fd63eaabb077308 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Mon, 28 Jan 2019 10:09:17 -0800
Subject: [PATCH 14/68] py2 testing

---
 setup.py           | 17 +++++++++++------
 test/test_tests.py | 20 +++++---------------
 warcio/tester.py   | 26 ++++++++++++++++++++------
 3 files changed, 36 insertions(+), 27 deletions(-)

diff --git a/setup.py b/setup.py
index 3a1dce58..57f402dc 100755
--- a/setup.py
+++ b/setup.py
@@ -4,6 +4,7 @@
 from setuptools import setup, find_packages
 from setuptools.command.test import test as TestCommand
 import glob
+import sys
 
 __version__ = '1.7.0.dev0'
 
@@ -21,6 +22,15 @@ def run_tests(self):
         errcode = pytest.main(['--doctest-module', './warcio', '--cov', 'warcio', '-v', 'test/'])
         sys.exit(errcode)
 
+tests_require = [
+    'pytest',
+    'pytest-cov',
+    'httpbin==0.5.0',
+    'requests',
+]
+if sys.version_info < (3, 3):
+    tests_require.append('ipaddress')
+
 setup(
     name='warcio',
     version=__version__,
@@ -44,12 +54,7 @@ def run_tests(self):
     """,
     cmdclass={'test': PyTest},
     test_suite='',
-    tests_require=[
-        'pytest',
-        'pytest-cov',
-        'httpbin==0.5.0',
-        'requests',
-    ],
+    tests_require=tests_require,
     classifiers=[
         'Development Status :: 4 - Beta',
         'Environment :: Web Environment',
diff --git a/test/test_tests.py b/test/test_tests.py
index 98517308..dab1e669 100644
--- a/test/test_tests.py
+++ b/test/test_tests.py
@@ -187,13 +187,9 @@ def test_torture_validate_record():
     value = helper(args, 0)
     print(remove_before_test_data(value))
 
-    ret = remove_before_test_data(value)
+    actual = remove_before_test_data(value)
 
-    if six.PY2:
-        expected = expected.replace('\n    error: warc-fields contains invalid utf-8: \'utf-8\' codec can\'t decode byte 0xc3 in position 57: invalid continuation byte\n', '\n')
-        ret = ret.replace('\n    comment: did not check ip address format, install ipaddress module from pypi if you care\n', '\n')
-
-    assert ret == expected
+    assert actual == expected
 
 
 def test_torture_validate_field():
@@ -278,16 +274,10 @@ def test_torture_validate_field():
 """
 
     value = helper(args, 0)
-    ret = remove_before_test_data(value)
-
-    if six.PY2:
-        if 'error: invalid ip warc-ip-address 1.2.3.4.5' not in ret:
-            # user did not install ipaddress module
-            expected = expected.replace('\n    error: invalid ip warc-ip-address 1.2.3.4.5\n', '\n')
-            ret = ret.replace('\n    comment: did not check ip address format, install ipaddress module from pypi if you care\n', '\n')
+    actual = remove_before_test_data(value)
 
-    print(ret)
-    assert ret == expected
+    print(actual)
+    assert actual == expected
 
 
 def test_arc():
diff --git a/warcio/tester.py b/warcio/tester.py
index eaf7f09f..f00479ff 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -2,6 +2,7 @@
 
 import re
 import sys
+import six
 
 from warcio.archiveiterator import WARCIterator
 from warcio.utils import to_native_str, Digester
@@ -68,11 +69,22 @@ def validate_warc_fields(record, commentary):
     # field-name = token  # token_re
 
     content = record.content
-    try:
-        text = to_native_str(content, 'utf-8', errors='strict')
-    except UnicodeDecodeError as e:
-        commentary.error('warc-fields contains invalid utf-8: '+str(e))
-        text = to_native_str(content, 'utf-8', errors='replace')
+
+    if six.PY2:  # pragma: no cover
+        try:
+            content.decode('utf-8', errors='strict')
+            text = content  # already a str
+        except UnicodeDecodeError as e:
+            err = str(e)
+            err = err.replace('utf8', 'utf-8')  # sigh
+            commentary.error('warc-fields contains invalid utf-8: '+err)
+            text = content.decode('utf-8', errors='replace')
+    else:  # pragma: no cover
+        try:
+            text = to_native_str(content, 'utf-8', errors='strict')
+        except UnicodeDecodeError as e:
+            commentary.error('warc-fields contains invalid utf-8: '+str(e))
+            text = to_native_str(content, 'utf-8', errors='replace')
 
     first_line = True
     lines = []
@@ -350,10 +362,12 @@ def validate_digest(field, value, record, version, commentary, pending):
 def validate_ip(field, value, record, version, commentary, pending):
     try:
         import ipaddress
+        if six.PY2:  # pragma: no cover
+            value = unicode(value)
         ipaddress.ip_address(value)
     except ValueError:
         commentary.error('invalid ip', field, value)
-    except (ImportError, NameError):  # pragma: no cover (for python 2.7)
+    except (ImportError, NameError):  # pragma: no cover
         commentary.comment('did not check ip address format, install ipaddress module from pypi if you care')
 
 

From 2b8d596701e2805291be053cdd8a051200ae5fe6 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Mon, 28 Jan 2019 11:15:57 -0800
Subject: [PATCH 15/68] add record ids to test

---
 test/data/standard-torture-missing.warc       |  5 -
 .../standard-torture-validate-record.warc     | 25 +++++
 test/test_tests.py                            | 91 ++++++++-----------
 warcio/tester.py                              |  2 +-
 4 files changed, 63 insertions(+), 60 deletions(-)
 delete mode 100644 test/data/standard-torture-missing.warc

diff --git a/test/data/standard-torture-missing.warc b/test/data/standard-torture-missing.warc
deleted file mode 100644
index a1ab0714..00000000
--- a/test/data/standard-torture-missing.warc
+++ /dev/null
@@ -1,5 +0,0 @@
-WARC/1.0
-WARC-Type: warcinfo
-Content-Length: 0
-
-
diff --git a/test/data/standard-torture-validate-record.warc b/test/data/standard-torture-validate-record.warc
index 6f06205e..fa03b38e 100644
--- a/test/data/standard-torture-validate-record.warc
+++ b/test/data/standard-torture-validate-record.warc
@@ -15,13 +15,24 @@ token cannot have a space:
 
 
 WARC/1.0
+WARC-Record-ID: test-empty-warc-fields
 WARC-Type: warcinfo
 Content-Type: application/warc-fields
 Content-Length: 0
 
 
+WARC/1.0
+WARC-Type: warcinfo
+WARC-Record-ID: test-warcinfo-non-recommended-content-type
+Content-Type: not-application/warc-fields
+Content-Length: 5
+
+foo
+
+
 WARC/1.0
 WARC-Type: response
+WARC-Record-ID: test-response-content-type
 WARC-Target-URI: HtTp://example.com/
 Content-Type: text/plain
 Content-Length: 0
@@ -29,6 +40,7 @@ Content-Length: 0
 
 WARC/1.0
 WARC-Type: resource
+WARC-Record-ID: test-resource-dns-content-type
 WARC-Target-URI: DnS:asdfasdf
 Content-Type: text/plain
 Content-Length: 0
@@ -36,6 +48,8 @@ Content-Length: 0
 
 WARC/1.0
 WARC-Type: resource
+WARC-Record-ID: test-resource-dns-empty
+WARC-Test-TODO: add another with valid block
 WARC-Target-URI: DnS:asdfasdf
 Content-Type: text/dns
 Content-Length: 0
@@ -43,12 +57,14 @@ Content-Length: 0
 
 WARC/1.0
 WARC-Type: resource
+WARC-Record-ID: test-resource-not-dns
 WARC-Target-URI: foo:bar
 Content-Length: 0
 
 
 WARC/1.0
 WARC-Type: request
+WARC-Record-ID: test-request-unrecommended-content-type
 WARC-Target-URI: hTtP://example.com/
 Content-Type: text/plain
 Content-Length: 0
@@ -56,6 +72,7 @@ Content-Length: 0
 
 WARC/1.0
 WARC-Type: request
+WARC-Record-ID: test-request-unrecommended-content-type-with-ip
 WARC-Target-URI: hTtP://example.com/
 WARC-IP-Address: 1.2.3.4
 Content-Type: text/plain
@@ -64,47 +81,55 @@ Content-Length: 0
 
 WARC/1.0
 WARC-Type: metadata
+WARC-Record-ID: test-metadata-warc-fields-empty
 Content-Type: application/warc-fields
 Content-Length: 0
 
 
 WARC/1.0
 WARC-Type: metadata
+WARC-Record-ID: test-metadata-not-warc-fields
 Content-Type: not-application/warc-fields
 Content-Length: 0
 
 
 WARC/1.0
 WARC-Type: revisit
+WARC-Record-ID: test-revisit-profile-unknown
 WARC-Profile: none
 Content-Length: 0
 
 
 WARC/1.0
 WARC-Type: revisit
+WARC-Record-ID: test-revisit-profile-future
 WARC-Profile: http://netpreserve.org/warc/1.1/revisit/identical-payload-digest
 Content-Length: 0
 
 
 WARC/1.0
 WARC-Type: revisit
+WARC-Record-ID: test-revisit-profile-good
 WARC-Profile: http://netpreserve.org/warc/1.0/revisit/server-not-modified
 Content-Length: 0
 
 
 WARC/1.0
 WARC-Type: conversion
+WARC-Record-ID: test-conversion
 Content-Length: 0
 
 
 WARC/1.0
 WARC-Type: continuation
+WARC-Record-ID: test-continuation-segment-1
 WARC-Segment-Number: 1
 Content-Length: 0
 
 
 WARC/1.0
 WARC-Type: continuation
+WARC-Record-ID: test-continuation-segment-valid
 WARC-Segment-Number: 2
 Content-Length: 0
 
diff --git a/test/test_tests.py b/test/test_tests.py
index dab1e669..723b2bd9 100644
--- a/test/test_tests.py
+++ b/test/test_tests.py
@@ -1,5 +1,3 @@
-import six
-
 from warcio.cli import main
 from warcio.utils import to_native_str
 import warcio.tester
@@ -32,28 +30,6 @@ def remove_before_test_data(s):
     return ret
 
 
-def test_torture_missing():
-    files = ['standard-torture-missing.warc']
-    files = [get_test_file(filename) for filename in files]
-
-    args = ['test']
-    args.extend(files)
-
-    expected = """\
-test/data/standard-torture-missing.warc
-  WARC-Record-ID None
-    WARC-Type warcinfo
-    digest not present
-    error: missing required header Content-Type
-    error: missing required header WARC-Date
-    error: missing required header WARC-Record-ID
-    recommendation: warcinfo Content-Type of application/warc-fields, saw none
-"""
-
-    value = helper(args, 0)
-    assert remove_before_test_data(value) == expected
-
-
 def test_torture_validate_record():
     files = ['standard-torture-validate-record.warc']
     files = [get_test_file(filename) for filename in files]
@@ -75,110 +51,117 @@ def test_torture_validate_record():
     comment: warc-fields lines must end with \\r\\n: test: lines should end with \\r\\n
     comment: Missing field-name : in warc-fields line: no colon
     comment: invalid warc-fields name: token cannot have a space
-  WARC-Record-ID None
+  WARC-Record-ID test-empty-warc-fields
     WARC-Type warcinfo
     digest not present
+    error: uri must be within <> warc-record-id test-empty-warc-fields
     error: missing required header WARC-Date
-    error: missing required header WARC-Record-ID
     comment: warc-fields body present but empty
-  WARC-Record-ID None
+  WARC-Record-ID test-warcinfo-non-recommended-content-type
+    WARC-Type warcinfo
+    digest not present
+    error: uri must be within <> warc-record-id test-warcinfo-non-recommended-content-type
+    error: missing required header WARC-Date
+    recommendation: warcinfo Content-Type recommended to be application/warc-fields, saw not-application/warc-fields
+  WARC-Record-ID test-response-content-type
     WARC-Type response
     digest not present
+    error: uri must be within <> warc-record-id test-response-content-type
     error: missing required header WARC-Date
-    error: missing required header WARC-Record-ID
     error: responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw text/plain
     error: WARC-IP-Address should be used for http and https responses
-  WARC-Record-ID None
+  WARC-Record-ID test-resource-dns-content-type
     WARC-Type resource
     digest not present
+    error: uri must be within <> warc-record-id test-resource-dns-content-type
     error: missing required header WARC-Date
-    error: missing required header WARC-Record-ID
     error: recource records for dns: shall have Content-Type of text/dns, saw text/plain
-  WARC-Record-ID None
+  WARC-Record-ID test-resource-dns-empty
     WARC-Type resource
     digest not present
+    error: uri must be within <> warc-record-id test-resource-dns-empty
     error: missing required header WARC-Date
-    error: missing required header WARC-Record-ID
-  WARC-Record-ID None
+    comment: unknown field, no validation performed WARC-Test-TODO add another with valid block
+  WARC-Record-ID test-resource-not-dns
     WARC-Type resource
     digest not present
+    error: uri must be within <> warc-record-id test-resource-not-dns
     error: missing required header Content-Type
     error: missing required header WARC-Date
-    error: missing required header WARC-Record-ID
-  WARC-Record-ID None
+  WARC-Record-ID test-request-unrecommended-content-type
     WARC-Type request
     digest not present
+    error: uri must be within <> warc-record-id test-request-unrecommended-content-type
     error: missing required header WARC-Date
-    error: missing required header WARC-Record-ID
     error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw text/plain
     error: WARC-IP-Address should be used for http and https requests
-  WARC-Record-ID None
+  WARC-Record-ID test-request-unrecommended-content-type-with-ip
     WARC-Type request
     digest not present
+    error: uri must be within <> warc-record-id test-request-unrecommended-content-type-with-ip
     error: missing required header WARC-Date
-    error: missing required header WARC-Record-ID
     error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw text/plain
-  WARC-Record-ID None
+  WARC-Record-ID test-metadata-warc-fields-empty
     WARC-Type metadata
     digest not present
+    error: uri must be within <> warc-record-id test-metadata-warc-fields-empty
     error: missing required header WARC-Date
-    error: missing required header WARC-Record-ID
     comment: warc-fields body present but empty
-  WARC-Record-ID None
+  WARC-Record-ID test-metadata-not-warc-fields
     WARC-Type metadata
     digest not present
+    error: uri must be within <> warc-record-id test-metadata-not-warc-fields
     error: missing required header WARC-Date
-    error: missing required header WARC-Record-ID
-  WARC-Record-ID None
+  WARC-Record-ID test-revisit-profile-unknown
     WARC-Type revisit
     digest not present
+    error: uri must be within <> warc-record-id test-revisit-profile-unknown
     error: missing required header Content-Type
     error: missing required header WARC-Date
-    error: missing required header WARC-Record-ID
     error: missing required header WARC-Target-URI
     comment: extension seen warc-profile none
     comment: no revisit details validation done due to unknown profile
-  WARC-Record-ID None
+  WARC-Record-ID test-revisit-profile-future
     WARC-Type revisit
     digest not present
+    error: uri must be within <> warc-record-id test-revisit-profile-future
     error: missing required header Content-Type
     error: missing required header WARC-Date
-    error: missing required header WARC-Record-ID
     error: missing required header WARC-Target-URI
     error: missing required header WARC-Payload-Digest
     recommendation: missing recommended header WARC-Refers-To
     recommendation: missing recommended header WARC-Refers-To-Date
     recommendation: missing recommended header WARC-Refers-To-Target-URI
     comment: extension seen warc-profile http://netpreserve.org/warc/1.1/revisit/identical-payload-digest
-  WARC-Record-ID None
+  WARC-Record-ID test-revisit-profile-good
     WARC-Type revisit
     digest not present
+    error: uri must be within <> warc-record-id test-revisit-profile-good
     error: missing required header Content-Type
     error: missing required header WARC-Date
-    error: missing required header WARC-Record-ID
     error: missing required header WARC-Target-URI
     recommendation: missing recommended header WARC-Refers-To
     recommendation: missing recommended header WARC-Refers-To-Date
-  WARC-Record-ID None
+  WARC-Record-ID test-conversion
     WARC-Type conversion
     digest not present
+    error: uri must be within <> warc-record-id test-conversion
     error: missing required header WARC-Date
-    error: missing required header WARC-Record-ID
     error: missing required header WARC-Target-URI
-  WARC-Record-ID None
+  WARC-Record-ID test-continuation-segment-1
     WARC-Type continuation
     digest not present
+    error: uri must be within <> warc-record-id test-continuation-segment-1
     error: missing required header WARC-Date
-    error: missing required header WARC-Record-ID
     error: missing required header WARC-Segment-Origin-ID
     error: missing required header WARC-Target-URI
     error: continuation record must have WARC-Segment-Number > 1, saw 1
     comment: warcio test continuation code has not been tested, expect bugs
-  WARC-Record-ID None
+  WARC-Record-ID test-continuation-segment-valid
     WARC-Type continuation
     digest not present
+    error: uri must be within <> warc-record-id test-continuation-segment-valid
     error: missing required header WARC-Date
-    error: missing required header WARC-Record-ID
     error: missing required header WARC-Segment-Origin-ID
     error: missing required header WARC-Target-URI
     comment: warcio test continuation code has not been tested, expect bugs
diff --git a/warcio/tester.py b/warcio/tester.py
index f00479ff..e9755c8c 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -125,7 +125,7 @@ def validate_warc_fields(record, commentary):
 def validate_warcinfo(record, commentary, pending):
     content_type = record.rec_headers.get_header('Content-Type', 'none')
     if content_type.lower() != 'application/warc-fields':
-        commentary.recommendation('warcinfo Content-Type of application/warc-fields, saw', content_type)
+        commentary.recommendation('warcinfo Content-Type recommended to be application/warc-fields, saw', content_type)
     else:
         #   format: warc-fields
         #   allowable fields include but not limited to DMCI plus the following

From c704fe9886245204158edc89e32da40799e5eaa1 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Mon, 28 Jan 2019 12:19:35 -0800
Subject: [PATCH 16/68] preserve capitalization in messages

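---
Review note: in the validate_record hunk of this patch, the
WARC-Concurrent-To exemption still compares the original-case `field`
against the lower-case literal 'warc-concurrent-to'. Now that `field` is
no longer lower-cased, that comparison should use `field_l`; as written, a
repeated header spelled "WARC-Concurrent-To" is reported as
"duplicate field seen".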
 test/test_tests.py | 142 ++++++++++++++++++++++-----------------------
 warcio/tester.py   |  15 +++--
 2 files changed, 78 insertions(+), 79 deletions(-)

diff --git a/test/test_tests.py b/test/test_tests.py
index 723b2bd9..c922eff1 100644
--- a/test/test_tests.py
+++ b/test/test_tests.py
@@ -42,7 +42,7 @@ def test_torture_validate_record():
   WARC-Record-ID None
     WARC-Type warcinfo
     digest not present
-    error: uri must be within <> warc-refers-to probhibited
+    error: uri must be within <> WARC-Refers-To probhibited
     error: missing required header WARC-Date
     error: missing required header WARC-Record-ID
     error: field not allowed in record_type WARC-Refers-To warcinfo
@@ -54,77 +54,77 @@ def test_torture_validate_record():
   WARC-Record-ID test-empty-warc-fields
     WARC-Type warcinfo
     digest not present
-    error: uri must be within <> warc-record-id test-empty-warc-fields
+    error: uri must be within <> WARC-Record-ID test-empty-warc-fields
     error: missing required header WARC-Date
     comment: warc-fields body present but empty
   WARC-Record-ID test-warcinfo-non-recommended-content-type
     WARC-Type warcinfo
     digest not present
-    error: uri must be within <> warc-record-id test-warcinfo-non-recommended-content-type
+    error: uri must be within <> WARC-Record-ID test-warcinfo-non-recommended-content-type
     error: missing required header WARC-Date
     recommendation: warcinfo Content-Type recommended to be application/warc-fields, saw not-application/warc-fields
   WARC-Record-ID test-response-content-type
     WARC-Type response
     digest not present
-    error: uri must be within <> warc-record-id test-response-content-type
+    error: uri must be within <> WARC-Record-ID test-response-content-type
     error: missing required header WARC-Date
     error: responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw text/plain
     error: WARC-IP-Address should be used for http and https responses
   WARC-Record-ID test-resource-dns-content-type
     WARC-Type resource
     digest not present
-    error: uri must be within <> warc-record-id test-resource-dns-content-type
+    error: uri must be within <> WARC-Record-ID test-resource-dns-content-type
     error: missing required header WARC-Date
     error: recource records for dns: shall have Content-Type of text/dns, saw text/plain
   WARC-Record-ID test-resource-dns-empty
     WARC-Type resource
     digest not present
-    error: uri must be within <> warc-record-id test-resource-dns-empty
+    error: uri must be within <> WARC-Record-ID test-resource-dns-empty
     error: missing required header WARC-Date
     comment: unknown field, no validation performed WARC-Test-TODO add another with valid block
   WARC-Record-ID test-resource-not-dns
     WARC-Type resource
     digest not present
-    error: uri must be within <> warc-record-id test-resource-not-dns
+    error: uri must be within <> WARC-Record-ID test-resource-not-dns
     error: missing required header Content-Type
     error: missing required header WARC-Date
   WARC-Record-ID test-request-unrecommended-content-type
     WARC-Type request
     digest not present
-    error: uri must be within <> warc-record-id test-request-unrecommended-content-type
+    error: uri must be within <> WARC-Record-ID test-request-unrecommended-content-type
     error: missing required header WARC-Date
     error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw text/plain
     error: WARC-IP-Address should be used for http and https requests
   WARC-Record-ID test-request-unrecommended-content-type-with-ip
     WARC-Type request
     digest not present
-    error: uri must be within <> warc-record-id test-request-unrecommended-content-type-with-ip
+    error: uri must be within <> WARC-Record-ID test-request-unrecommended-content-type-with-ip
     error: missing required header WARC-Date
     error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw text/plain
   WARC-Record-ID test-metadata-warc-fields-empty
     WARC-Type metadata
     digest not present
-    error: uri must be within <> warc-record-id test-metadata-warc-fields-empty
+    error: uri must be within <> WARC-Record-ID test-metadata-warc-fields-empty
     error: missing required header WARC-Date
     comment: warc-fields body present but empty
   WARC-Record-ID test-metadata-not-warc-fields
     WARC-Type metadata
     digest not present
-    error: uri must be within <> warc-record-id test-metadata-not-warc-fields
+    error: uri must be within <> WARC-Record-ID test-metadata-not-warc-fields
     error: missing required header WARC-Date
   WARC-Record-ID test-revisit-profile-unknown
     WARC-Type revisit
     digest not present
-    error: uri must be within <> warc-record-id test-revisit-profile-unknown
+    error: uri must be within <> WARC-Record-ID test-revisit-profile-unknown
     error: missing required header Content-Type
     error: missing required header WARC-Date
     error: missing required header WARC-Target-URI
-    comment: extension seen warc-profile none
+    comment: extension seen WARC-Profile none
     comment: no revisit details validation done due to unknown profile
   WARC-Record-ID test-revisit-profile-future
     WARC-Type revisit
     digest not present
-    error: uri must be within <> warc-record-id test-revisit-profile-future
+    error: uri must be within <> WARC-Record-ID test-revisit-profile-future
     error: missing required header Content-Type
     error: missing required header WARC-Date
     error: missing required header WARC-Target-URI
@@ -132,11 +132,11 @@ def test_torture_validate_record():
     recommendation: missing recommended header WARC-Refers-To
     recommendation: missing recommended header WARC-Refers-To-Date
     recommendation: missing recommended header WARC-Refers-To-Target-URI
-    comment: extension seen warc-profile http://netpreserve.org/warc/1.1/revisit/identical-payload-digest
+    comment: extension seen WARC-Profile http://netpreserve.org/warc/1.1/revisit/identical-payload-digest
   WARC-Record-ID test-revisit-profile-good
     WARC-Type revisit
     digest not present
-    error: uri must be within <> warc-record-id test-revisit-profile-good
+    error: uri must be within <> WARC-Record-ID test-revisit-profile-good
     error: missing required header Content-Type
     error: missing required header WARC-Date
     error: missing required header WARC-Target-URI
@@ -145,13 +145,13 @@ def test_torture_validate_record():
   WARC-Record-ID test-conversion
     WARC-Type conversion
     digest not present
-    error: uri must be within <> warc-record-id test-conversion
+    error: uri must be within <> WARC-Record-ID test-conversion
     error: missing required header WARC-Date
     error: missing required header WARC-Target-URI
   WARC-Record-ID test-continuation-segment-1
     WARC-Type continuation
     digest not present
-    error: uri must be within <> warc-record-id test-continuation-segment-1
+    error: uri must be within <> WARC-Record-ID test-continuation-segment-1
     error: missing required header WARC-Date
     error: missing required header WARC-Segment-Origin-ID
     error: missing required header WARC-Target-URI
@@ -160,7 +160,7 @@ def test_torture_validate_record():
   WARC-Record-ID test-continuation-segment-valid
     WARC-Type continuation
     digest not present
-    error: uri must be within <> warc-record-id test-continuation-segment-valid
+    error: uri must be within <> WARC-Record-ID test-continuation-segment-valid
     error: missing required header WARC-Date
     error: missing required header WARC-Segment-Origin-ID
     error: missing required header WARC-Target-URI
@@ -187,64 +187,64 @@ def test_torture_validate_field():
   WARC-Record-ID <foo:bar>
     WARC-Type does-not-exist
     unknown hash algorithm name in block digest
-    error: uri must not be within <> warc-target-uri <http://example.com/>
-    error: invalid uri scheme, bad character warc-target-uri <http://example.com/>
-    error: duplicate field seen warc-target-uri example.com
-    error: invalid uri, no scheme warc-target-uri example.com
-    error: duplicate field seen warc-target-uri ex ample.com
-    error: invalid uri, no scheme warc-target-uri ex ample.com
-    error: invalid uri, contains whitespace warc-target-uri ex ample.com
-    error: invalid uri scheme, bad character warc-target-uri ex ample.com
-    error: duplicate field seen warc-target-uri h<>ttp://example.com/
-    error: invalid uri scheme, bad character warc-target-uri h<>ttp://example.com/
-    error: duplicate field seen warc-type CAPITALIZED
-    error: uri must be within <> warc-concurrent-to http://example.com/
-    error: duplicate field seen warc-date 2017-03-06T04:03:53.Z
-    error: WARC 1.0 may not have fractional seconds warc-date 2017-03-06T04:03:53.Z
-    error: must contain a / content-type asdf
-    error: invalid subtype content-type asdf
-    error: duplicate field seen content-type has space/asdf
-    error: invalid type content-type has space/asdf
-    error: duplicate field seen content-type asdf/has space
-    error: invalid subtype content-type asdf/has space
-    error: duplicate field seen content-type asdf/has space;asdf
-    error: invalid subtype content-type asdf/has space;asdf
-    error: missing algorithm warc-block-digest asdf
-    error: duplicate field seen warc-block-digest has space:asdf
-    error: invalid algorithm warc-block-digest has space:asdf
-    error: duplicate field seen warc-block-digest sha1:&$*^&*^#*&^
-    error: invalid ip warc-ip-address 1.2.3.4.5
-    error: uri must be within <> warc-warcinfo-id asdf:asdf
-    error: duplicate field seen warc-profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest
-    error: must contain a / warc-identified-payload-type asdf
-    error: invalid subtype warc-identified-payload-type asdf
-    error: uri must be within <> warc-segment-origin-id http://example.com
-    error: must be an integer warc-segment-number not-an-integer
-    error: duplicate field seen warc-segment-number 0
-    error: must be 1 or greater warc-segment-number 0
-    error: non-continuation records must always have WARC-Segment-Number = 1 warc-segment-number 0
-    error: duplicate field seen warc-segment-number 1
-    error: duplicate field seen warc-segment-number 2
-    error: non-continuation records must always have WARC-Segment-Number = 1 warc-segment-number 2
-    error: duplicate field seen warc-segment-total-length not-an-integer
-    error: must be an integer warc-segment-total-length not-an-integer
-    comment: unknown WARC-Type warc-type does-not-exist
-    comment: WARC-Type is not lower-case warc-type CAPITALIZED
-    comment: unknown WARC-Type warc-type CAPITALIZED
-    comment: unknown digest algorithm warc-block-digest asdf
-    comment: Invalid-looking digest value warc-block-digest sha1:&$*^&*^#*&^
-    comment: extension seen warc-truncated invalid
-    comment: extension seen warc-profile asdf
+    error: uri must not be within <> WARC-Target-URI <http://example.com/>
+    error: invalid uri scheme, bad character WARC-Target-URI <http://example.com/>
+    error: duplicate field seen WARC-Target-URI example.com
+    error: invalid uri, no scheme WARC-Target-URI example.com
+    error: duplicate field seen WARC-Target-URI ex ample.com
+    error: invalid uri, no scheme WARC-Target-URI ex ample.com
+    error: invalid uri, contains whitespace WARC-Target-URI ex ample.com
+    error: invalid uri scheme, bad character WARC-Target-URI ex ample.com
+    error: duplicate field seen WARC-Target-URI h<>ttp://example.com/
+    error: invalid uri scheme, bad character WARC-Target-URI h<>ttp://example.com/
+    error: duplicate field seen WARC-Type CAPITALIZED
+    error: uri must be within <> WARC-Concurrent-To http://example.com/
+    error: duplicate field seen WARC-Date 2017-03-06T04:03:53.Z
+    error: WARC 1.0 may not have fractional seconds WARC-Date 2017-03-06T04:03:53.Z
+    error: must contain a / Content-Type asdf
+    error: invalid subtype Content-Type asdf
+    error: duplicate field seen Content-Type has space/asdf
+    error: invalid type Content-Type has space/asdf
+    error: duplicate field seen Content-Type asdf/has space
+    error: invalid subtype Content-Type asdf/has space
+    error: duplicate field seen Content-Type asdf/has space;asdf
+    error: invalid subtype Content-Type asdf/has space;asdf
+    error: missing algorithm WARC-Block-Digest asdf
+    error: duplicate field seen WARC-Block-Digest has space:asdf
+    error: invalid algorithm WARC-Block-Digest has space:asdf
+    error: duplicate field seen WARC-Block-Digest sha1:&$*^&*^#*&^
+    error: invalid ip WARC-IP-Address 1.2.3.4.5
+    error: uri must be within <> WARC-Warcinfo-ID asdf:asdf
+    error: duplicate field seen WARC-Profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest
+    error: must contain a / WARC-Identified-Payload-Type asdf
+    error: invalid subtype WARC-Identified-Payload-Type asdf
+    error: uri must be within <> WARC-Segment-Origin-ID http://example.com
+    error: must be an integer WARC-Segment-Number not-an-integer
+    error: duplicate field seen WARC-Segment-Number 0
+    error: must be 1 or greater WARC-Segment-Number 0
+    error: non-continuation records must always have WARC-Segment-Number = 1 WARC-Segment-Number 0
+    error: duplicate field seen WARC-Segment-Number 1
+    error: duplicate field seen WARC-Segment-Number 2
+    error: non-continuation records must always have WARC-Segment-Number = 1 WARC-Segment-Number 2
+    error: duplicate field seen WARC-Segment-Total-Length not-an-integer
+    error: must be an integer WARC-Segment-Total-Length not-an-integer
+    comment: unknown WARC-Type WARC-Type does-not-exist
+    comment: WARC-Type is not lower-case WARC-Type CAPITALIZED
+    comment: unknown WARC-Type WARC-Type CAPITALIZED
+    comment: unknown digest algorithm WARC-Block-Digest asdf
+    comment: Invalid-looking digest value WARC-Block-Digest sha1:&$*^&*^#*&^
+    comment: extension seen WARC-Truncated invalid
+    comment: extension seen WARC-Profile asdf
     comment: field was introduced after this warc version WARC-Refers-To-Target-URI http://example.com 1.0
     comment: field was introduced after this warc version WARC-Refers-To-Date not-a-date 1.0
     comment: unknown field, no validation performed WARC-Unknown-Field asdf
   WARC-Record-ID None
     WARC-Type invalid
     digest not present
-    error: duplicate field seen warc-date 2017-03-06T04:03:53.Z
-    error: fractional seconds must have 1-9 digits warc-date 2017-03-06T04:03:53.Z
-    error: duplicate field seen warc-date 2017-03-06T04:03:53.0Z
-    comment: unknown WARC-Type warc-type invalid
+    error: duplicate field seen WARC-Date 2017-03-06T04:03:53.Z
+    error: fractional seconds must have 1-9 digits WARC-Date 2017-03-06T04:03:53.Z
+    error: duplicate field seen WARC-Date 2017-03-06T04:03:53.0Z
+    comment: unknown WARC-Type WARC-Type invalid
   WARC-Record-ID None
     WARC-Type request
     digest not present
diff --git a/warcio/tester.py b/warcio/tester.py
index e9755c8c..2300d062 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -603,19 +603,18 @@ def validate_record(record):
 
     seen_fields = set()
     for field, value in record.rec_headers.headers:
-        field_case = field
-        field = field.lower()
-        if field != 'warc-concurrent-to' and field in seen_fields:
+        field_l = field.lower()
+        if field != 'warc-concurrent-to' and field_l in seen_fields:
             commentary.error('duplicate field seen', field, value)
-        seen_fields.add(field)
-        if field not in warc_fields:
-            commentary.comment('unknown field, no validation performed', field_case, value)
+        seen_fields.add(field_l)
+        if field_l not in warc_fields:
+            commentary.comment('unknown field, no validation performed', field, value)
             continue
-        config = warc_fields[field]
+        config = warc_fields[field_l]
         if 'minver' in config:
             if version < config['minver']:
                 # unknown fields are extensions, so this is a comment and not an error
-                commentary.comment('field was introduced after this warc version', field_case, value, version)
+                commentary.comment('field was introduced after this warc version', field, value, version)
         if 'validate' in config:
             config['validate'](field, value, record, version, commentary, pending)
 

From 3839fa16bcf1ec58053379e4b695314d72e9afd6 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Mon, 28 Jan 2019 12:55:48 -0800
Subject: [PATCH 17/68] capitals and colons

---
 test/test_tests.py | 264 ++++++++++++++++++++++-----------------------
 warcio/tester.py   |  88 +++++++--------
 2 files changed, 177 insertions(+), 175 deletions(-)

diff --git a/test/test_tests.py b/test/test_tests.py
index c922eff1..91eba656 100644
--- a/test/test_tests.py
+++ b/test/test_tests.py
@@ -42,128 +42,128 @@ def test_torture_validate_record():
   WARC-Record-ID None
     WARC-Type warcinfo
     digest not present
-    error: uri must be within <> WARC-Refers-To probhibited
-    error: missing required header WARC-Date
-    error: missing required header WARC-Record-ID
-    error: field not allowed in record_type WARC-Refers-To warcinfo
+    error: uri must be within <>: WARC-Refers-To probhibited
+    error: missing required header: WARC-Date
+    error: missing required header: WARC-Record-ID
+    error: field not allowed in record type: warcinfo WARC-Refers-To
     error: warc-fields contains invalid utf-8: 'utf-8' codec can't decode byte 0xc3 in position 57: invalid continuation byte
     comment: The first line of warc-fields cannot start with whitespace
     comment: warc-fields lines must end with \\r\\n: test: lines should end with \\r\\n
-    comment: Missing field-name : in warc-fields line: no colon
-    comment: invalid warc-fields name: token cannot have a space
+    comment: Missing colon in warc-fields line: no colon
+    comment: Invalid warc-fields name: token cannot have a space
   WARC-Record-ID test-empty-warc-fields
     WARC-Type warcinfo
     digest not present
-    error: uri must be within <> WARC-Record-ID test-empty-warc-fields
-    error: missing required header WARC-Date
+    error: uri must be within <>: WARC-Record-ID test-empty-warc-fields
+    error: missing required header: WARC-Date
     comment: warc-fields body present but empty
   WARC-Record-ID test-warcinfo-non-recommended-content-type
     WARC-Type warcinfo
     digest not present
-    error: uri must be within <> WARC-Record-ID test-warcinfo-non-recommended-content-type
-    error: missing required header WARC-Date
-    recommendation: warcinfo Content-Type recommended to be application/warc-fields, saw not-application/warc-fields
+    error: uri must be within <>: WARC-Record-ID test-warcinfo-non-recommended-content-type
+    error: missing required header: WARC-Date
+    recommendation: warcinfo Content-Type recommended to be application/warc-fields, saw: not-application/warc-fields
   WARC-Record-ID test-response-content-type
     WARC-Type response
     digest not present
-    error: uri must be within <> WARC-Record-ID test-response-content-type
-    error: missing required header WARC-Date
-    error: responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw text/plain
+    error: uri must be within <>: WARC-Record-ID test-response-content-type
+    error: missing required header: WARC-Date
+    error: responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw: text/plain
     error: WARC-IP-Address should be used for http and https responses
   WARC-Record-ID test-resource-dns-content-type
     WARC-Type resource
     digest not present
-    error: uri must be within <> WARC-Record-ID test-resource-dns-content-type
-    error: missing required header WARC-Date
-    error: recource records for dns: shall have Content-Type of text/dns, saw text/plain
+    error: uri must be within <>: WARC-Record-ID test-resource-dns-content-type
+    error: missing required header: WARC-Date
+    error: recource records for dns: shall have Content-Type of text/dns, saw: text/plain
   WARC-Record-ID test-resource-dns-empty
     WARC-Type resource
     digest not present
-    error: uri must be within <> WARC-Record-ID test-resource-dns-empty
-    error: missing required header WARC-Date
-    comment: unknown field, no validation performed WARC-Test-TODO add another with valid block
+    error: uri must be within <>: WARC-Record-ID test-resource-dns-empty
+    error: missing required header: WARC-Date
+    comment: unknown field, no validation performed: WARC-Test-TODO add another with valid block
   WARC-Record-ID test-resource-not-dns
     WARC-Type resource
     digest not present
-    error: uri must be within <> WARC-Record-ID test-resource-not-dns
-    error: missing required header Content-Type
-    error: missing required header WARC-Date
+    error: uri must be within <>: WARC-Record-ID test-resource-not-dns
+    error: missing required header: Content-Type
+    error: missing required header: WARC-Date
   WARC-Record-ID test-request-unrecommended-content-type
     WARC-Type request
     digest not present
-    error: uri must be within <> WARC-Record-ID test-request-unrecommended-content-type
-    error: missing required header WARC-Date
-    error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw text/plain
+    error: uri must be within <>: WARC-Record-ID test-request-unrecommended-content-type
+    error: missing required header: WARC-Date
+    error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw: text/plain
     error: WARC-IP-Address should be used for http and https requests
   WARC-Record-ID test-request-unrecommended-content-type-with-ip
     WARC-Type request
     digest not present
-    error: uri must be within <> WARC-Record-ID test-request-unrecommended-content-type-with-ip
-    error: missing required header WARC-Date
-    error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw text/plain
+    error: uri must be within <>: WARC-Record-ID test-request-unrecommended-content-type-with-ip
+    error: missing required header: WARC-Date
+    error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw: text/plain
   WARC-Record-ID test-metadata-warc-fields-empty
     WARC-Type metadata
     digest not present
-    error: uri must be within <> WARC-Record-ID test-metadata-warc-fields-empty
-    error: missing required header WARC-Date
+    error: uri must be within <>: WARC-Record-ID test-metadata-warc-fields-empty
+    error: missing required header: WARC-Date
     comment: warc-fields body present but empty
   WARC-Record-ID test-metadata-not-warc-fields
     WARC-Type metadata
     digest not present
-    error: uri must be within <> WARC-Record-ID test-metadata-not-warc-fields
-    error: missing required header WARC-Date
+    error: uri must be within <>: WARC-Record-ID test-metadata-not-warc-fields
+    error: missing required header: WARC-Date
   WARC-Record-ID test-revisit-profile-unknown
     WARC-Type revisit
     digest not present
-    error: uri must be within <> WARC-Record-ID test-revisit-profile-unknown
-    error: missing required header Content-Type
-    error: missing required header WARC-Date
-    error: missing required header WARC-Target-URI
-    comment: extension seen WARC-Profile none
-    comment: no revisit details validation done due to unknown profile
+    error: uri must be within <>: WARC-Record-ID test-revisit-profile-unknown
+    error: missing required header: Content-Type
+    error: missing required header: WARC-Date
+    error: missing required header: WARC-Target-URI
+    comment: extension seen: WARC-Profile none
+    comment: no revisit details validation done due to unknown profile: none
   WARC-Record-ID test-revisit-profile-future
     WARC-Type revisit
     digest not present
-    error: uri must be within <> WARC-Record-ID test-revisit-profile-future
-    error: missing required header Content-Type
-    error: missing required header WARC-Date
-    error: missing required header WARC-Target-URI
-    error: missing required header WARC-Payload-Digest
-    recommendation: missing recommended header WARC-Refers-To
-    recommendation: missing recommended header WARC-Refers-To-Date
-    recommendation: missing recommended header WARC-Refers-To-Target-URI
-    comment: extension seen WARC-Profile http://netpreserve.org/warc/1.1/revisit/identical-payload-digest
+    error: uri must be within <>: WARC-Record-ID test-revisit-profile-future
+    error: missing required header: Content-Type
+    error: missing required header: WARC-Date
+    error: missing required header: WARC-Target-URI
+    error: missing required header: WARC-Payload-Digest
+    recommendation: missing recommended header: WARC-Refers-To
+    recommendation: missing recommended header: WARC-Refers-To-Date
+    recommendation: missing recommended header: WARC-Refers-To-Target-URI
+    comment: extension seen: WARC-Profile http://netpreserve.org/warc/1.1/revisit/identical-payload-digest
   WARC-Record-ID test-revisit-profile-good
     WARC-Type revisit
     digest not present
-    error: uri must be within <> WARC-Record-ID test-revisit-profile-good
-    error: missing required header Content-Type
-    error: missing required header WARC-Date
-    error: missing required header WARC-Target-URI
-    recommendation: missing recommended header WARC-Refers-To
-    recommendation: missing recommended header WARC-Refers-To-Date
+    error: uri must be within <>: WARC-Record-ID test-revisit-profile-good
+    error: missing required header: Content-Type
+    error: missing required header: WARC-Date
+    error: missing required header: WARC-Target-URI
+    recommendation: missing recommended header: WARC-Refers-To
+    recommendation: missing recommended header: WARC-Refers-To-Date
   WARC-Record-ID test-conversion
     WARC-Type conversion
     digest not present
-    error: uri must be within <> WARC-Record-ID test-conversion
-    error: missing required header WARC-Date
-    error: missing required header WARC-Target-URI
+    error: uri must be within <>: WARC-Record-ID test-conversion
+    error: missing required header: WARC-Date
+    error: missing required header: WARC-Target-URI
   WARC-Record-ID test-continuation-segment-1
     WARC-Type continuation
     digest not present
-    error: uri must be within <> WARC-Record-ID test-continuation-segment-1
-    error: missing required header WARC-Date
-    error: missing required header WARC-Segment-Origin-ID
-    error: missing required header WARC-Target-URI
-    error: continuation record must have WARC-Segment-Number > 1, saw 1
+    error: uri must be within <>: WARC-Record-ID test-continuation-segment-1
+    error: missing required header: WARC-Date
+    error: missing required header: WARC-Segment-Origin-ID
+    error: missing required header: WARC-Target-URI
+    error: continuation record must have WARC-Segment-Number > 1, saw: 1
     comment: warcio test continuation code has not been tested, expect bugs
   WARC-Record-ID test-continuation-segment-valid
     WARC-Type continuation
     digest not present
-    error: uri must be within <> WARC-Record-ID test-continuation-segment-valid
-    error: missing required header WARC-Date
-    error: missing required header WARC-Segment-Origin-ID
-    error: missing required header WARC-Target-URI
+    error: uri must be within <>: WARC-Record-ID test-continuation-segment-valid
+    error: missing required header: WARC-Date
+    error: missing required header: WARC-Segment-Origin-ID
+    error: missing required header: WARC-Target-URI
     comment: warcio test continuation code has not been tested, expect bugs
 """
 
@@ -187,73 +187,73 @@ def test_torture_validate_field():
   WARC-Record-ID <foo:bar>
     WARC-Type does-not-exist
     unknown hash algorithm name in block digest
-    error: uri must not be within <> WARC-Target-URI <http://example.com/>
-    error: invalid uri scheme, bad character WARC-Target-URI <http://example.com/>
-    error: duplicate field seen WARC-Target-URI example.com
-    error: invalid uri, no scheme WARC-Target-URI example.com
-    error: duplicate field seen WARC-Target-URI ex ample.com
-    error: invalid uri, no scheme WARC-Target-URI ex ample.com
-    error: invalid uri, contains whitespace WARC-Target-URI ex ample.com
-    error: invalid uri scheme, bad character WARC-Target-URI ex ample.com
-    error: duplicate field seen WARC-Target-URI h<>ttp://example.com/
-    error: invalid uri scheme, bad character WARC-Target-URI h<>ttp://example.com/
-    error: duplicate field seen WARC-Type CAPITALIZED
-    error: uri must be within <> WARC-Concurrent-To http://example.com/
-    error: duplicate field seen WARC-Date 2017-03-06T04:03:53.Z
-    error: WARC 1.0 may not have fractional seconds WARC-Date 2017-03-06T04:03:53.Z
-    error: must contain a / Content-Type asdf
-    error: invalid subtype Content-Type asdf
-    error: duplicate field seen Content-Type has space/asdf
-    error: invalid type Content-Type has space/asdf
-    error: duplicate field seen Content-Type asdf/has space
-    error: invalid subtype Content-Type asdf/has space
-    error: duplicate field seen Content-Type asdf/has space;asdf
-    error: invalid subtype Content-Type asdf/has space;asdf
-    error: missing algorithm WARC-Block-Digest asdf
-    error: duplicate field seen WARC-Block-Digest has space:asdf
-    error: invalid algorithm WARC-Block-Digest has space:asdf
-    error: duplicate field seen WARC-Block-Digest sha1:&$*^&*^#*&^
-    error: invalid ip WARC-IP-Address 1.2.3.4.5
-    error: uri must be within <> WARC-Warcinfo-ID asdf:asdf
-    error: duplicate field seen WARC-Profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest
-    error: must contain a / WARC-Identified-Payload-Type asdf
-    error: invalid subtype WARC-Identified-Payload-Type asdf
-    error: uri must be within <> WARC-Segment-Origin-ID http://example.com
-    error: must be an integer WARC-Segment-Number not-an-integer
-    error: duplicate field seen WARC-Segment-Number 0
-    error: must be 1 or greater WARC-Segment-Number 0
-    error: non-continuation records must always have WARC-Segment-Number = 1 WARC-Segment-Number 0
-    error: duplicate field seen WARC-Segment-Number 1
-    error: duplicate field seen WARC-Segment-Number 2
-    error: non-continuation records must always have WARC-Segment-Number = 1 WARC-Segment-Number 2
-    error: duplicate field seen WARC-Segment-Total-Length not-an-integer
-    error: must be an integer WARC-Segment-Total-Length not-an-integer
-    comment: unknown WARC-Type WARC-Type does-not-exist
-    comment: WARC-Type is not lower-case WARC-Type CAPITALIZED
-    comment: unknown WARC-Type WARC-Type CAPITALIZED
-    comment: unknown digest algorithm WARC-Block-Digest asdf
-    comment: Invalid-looking digest value WARC-Block-Digest sha1:&$*^&*^#*&^
-    comment: extension seen WARC-Truncated invalid
-    comment: extension seen WARC-Profile asdf
-    comment: field was introduced after this warc version WARC-Refers-To-Target-URI http://example.com 1.0
-    comment: field was introduced after this warc version WARC-Refers-To-Date not-a-date 1.0
-    comment: unknown field, no validation performed WARC-Unknown-Field asdf
+    error: uri must not be within <>: WARC-Target-URI <http://example.com/>
+    error: invalid uri scheme, bad character: WARC-Target-URI <http://example.com/>
+    error: duplicate field seen: WARC-Target-URI example.com
+    error: invalid uri, no scheme: WARC-Target-URI example.com
+    error: duplicate field seen: WARC-Target-URI ex ample.com
+    error: invalid uri, no scheme: WARC-Target-URI ex ample.com
+    error: invalid uri, contains whitespace: WARC-Target-URI ex ample.com
+    error: invalid uri scheme, bad character: WARC-Target-URI ex ample.com
+    error: duplicate field seen: WARC-Target-URI h<>ttp://example.com/
+    error: invalid uri scheme, bad character: WARC-Target-URI h<>ttp://example.com/
+    error: duplicate field seen: WARC-Type CAPITALIZED
+    error: uri must be within <>: WARC-Concurrent-To http://example.com/
+    error: duplicate field seen: WARC-Date 2017-03-06T04:03:53.Z
+    error: WARC 1.0 time may not have fractional seconds: WARC-Date 2017-03-06T04:03:53.Z
+    error: must contain a /: Content-Type asdf
+    error: invalid subtype: Content-Type asdf
+    error: duplicate field seen: Content-Type has space/asdf
+    error: invalid type: Content-Type has space/asdf
+    error: duplicate field seen: Content-Type asdf/has space
+    error: invalid subtype: Content-Type asdf/has space
+    error: duplicate field seen: Content-Type asdf/has space;asdf
+    error: invalid subtype: Content-Type asdf/has space;asdf
+    error: missing algorithm: WARC-Block-Digest asdf
+    error: duplicate field seen: WARC-Block-Digest has space:asdf
+    error: invalid algorithm: WARC-Block-Digest has space:asdf
+    error: duplicate field seen: WARC-Block-Digest sha1:&$*^&*^#*&^
+    error: invalid ip: WARC-IP-Address 1.2.3.4.5
+    error: uri must be within <>: WARC-Warcinfo-ID asdf:asdf
+    error: duplicate field seen: WARC-Profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest
+    error: must contain a /: WARC-Identified-Payload-Type asdf
+    error: invalid subtype: WARC-Identified-Payload-Type asdf
+    error: uri must be within <>: WARC-Segment-Origin-ID http://example.com
+    error: must be an integer: WARC-Segment-Number not-an-integer
+    error: duplicate field seen: WARC-Segment-Number 0
+    error: must be 1 or greater: WARC-Segment-Number 0
+    error: non-continuation records must always have WARC-Segment-Number: 1: WARC-Segment-Number 0
+    error: duplicate field seen: WARC-Segment-Number 1
+    error: duplicate field seen: WARC-Segment-Number 2
+    error: non-continuation records must always have WARC-Segment-Number: 1: WARC-Segment-Number 2
+    error: duplicate field seen: WARC-Segment-Total-Length not-an-integer
+    error: must be an integer: WARC-Segment-Total-Length not-an-integer
+    comment: unknown WARC-Type: WARC-Type does-not-exist
+    comment: WARC-Type is not lower-case: WARC-Type CAPITALIZED
+    comment: unknown WARC-Type: WARC-Type CAPITALIZED
+    comment: unknown digest algorithm: WARC-Block-Digest asdf
+    comment: Invalid-looking digest value: WARC-Block-Digest sha1:&$*^&*^#*&^
+    comment: extension seen: WARC-Truncated invalid
+    comment: extension seen: WARC-Profile asdf
+    comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com
+    comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Date not-a-date
+    comment: unknown field, no validation performed: WARC-Unknown-Field asdf
   WARC-Record-ID None
     WARC-Type invalid
     digest not present
-    error: duplicate field seen WARC-Date 2017-03-06T04:03:53.Z
-    error: fractional seconds must have 1-9 digits WARC-Date 2017-03-06T04:03:53.Z
-    error: duplicate field seen WARC-Date 2017-03-06T04:03:53.0Z
-    comment: unknown WARC-Type WARC-Type invalid
+    error: duplicate field seen: WARC-Date 2017-03-06T04:03:53.Z
+    error: fractional seconds must have 1-9 digits: WARC-Date 2017-03-06T04:03:53.Z
+    error: duplicate field seen: WARC-Date 2017-03-06T04:03:53.0Z
+    comment: unknown WARC-Type: WARC-Type invalid
   WARC-Record-ID None
     WARC-Type request
     digest not present
-    error: missing required header Content-Type
-    error: missing required header WARC-Date
-    error: missing required header WARC-Record-ID
-    error: missing required header WARC-Target-URI
+    error: missing required header: Content-Type
+    error: missing required header: WARC-Date
+    error: missing required header: WARC-Record-ID
+    error: missing required header: WARC-Target-URI
     recommendation: do not segment WARC-Type request
-    comment: no configuration seen for WARC-Segment-Number request
+    comment: Unknown field for this record type, perhaps an extension: request WARC-Segment-Number
 """
 
     value = helper(args, 0)
@@ -312,9 +312,9 @@ def test_digests():
   WARC-Record-ID <urn:uuid:e6e395ca-0221-11e7-a18d-0242ac120005>
     WARC-Type revisit
     digest present but not checked
-    recommendation: missing recommended header WARC-Refers-To
-    comment: field was introduced after this warc version WARC-Refers-To-Target-URI http://example.com/ 1.0
-    comment: field was introduced after this warc version WARC-Refers-To-Date 2017-03-06T04:02:06Z 1.0
+    recommendation: missing recommended header: WARC-Refers-To
+    comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com/
+    comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Date 2017-03-06T04:02:06Z
   WARC-Record-ID <urn:uuid:e6e41fea-0221-11e7-8fe3-0242ac120007>
     WARC-Type request
     digest not present
@@ -330,14 +330,14 @@ def test_leftovers():
     assert not commentary.has_comments()
 
     # hard to test because invalid WARC Content-Length raises in archiveiterator
-    warcio.tester.validate_content_length('content-length', 'not-an-integer', None, '1.0', commentary, None)
+    warcio.tester.validate_content_length('Content-Length', 'not-an-integer', None, '1.0', commentary, None)
 
     # hard to test because warcio checks the WARC version
     warcio.tester.validate_profile('blah', 'blah', None, '999', commentary, None)
 
     expected = '''\
-error: must be an integer content-length not-an-integer
-comment: no profile check because unknown warc version blah blah
+error: must be an integer: Content-Length not-an-integer
+comment: no profile check because unknown warc version: blah blah
 '''
 
     assert '\n'.join(commentary.comments())+'\n' == expected
diff --git a/warcio/tester.py b/warcio/tester.py
index 2300d062..4ee05f1f 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -59,7 +59,8 @@ def __getattr__(self, name):
 
 
 def canon_content_type(s):
-    return s.lower().replace('; ', ';')
+    # wget omits the space after the ;, let that pass
+    return s.lower().replace(';msgtype=', '; msgtype=')
 
 
 def validate_warc_fields(record, commentary):
@@ -106,11 +107,11 @@ def validate_warc_fields(record, commentary):
         else:
             # check for field-name :
             if ':' not in line:
-                commentary.comment('Missing field-name : in warc-fields line:', line)
+                commentary.comment('Missing colon in warc-fields line:', line)
             else:
                 field_name = line.split(':', 1)[0]
                 if not re.search(token_re, field_name):
-                    commentary.comment('invalid warc-fields name:', field_name)
+                    commentary.comment('Invalid warc-fields name:', field_name)
                 else:
                     lines.append(line)
         first_line = False
@@ -125,7 +126,7 @@ def validate_warc_fields(record, commentary):
 def validate_warcinfo(record, commentary, pending):
     content_type = record.rec_headers.get_header('Content-Type', 'none')
     if content_type.lower() != 'application/warc-fields':
-        commentary.recommendation('warcinfo Content-Type recommended to be application/warc-fields, saw', content_type)
+        commentary.recommendation('warcinfo Content-Type recommended to be application/warc-fields, saw:', content_type)
     else:
         #   format: warc-fields
         #   allowable fields include but not limited to DMCI plus the following
@@ -145,8 +146,8 @@ def validate_response(record, commentary, pending):
 
     if target_uri.startswith('http:') or target_uri.startswith('https:'):
         content_type = record.rec_headers.get_header('Content-Type', 'none')
-        if canon_content_type(content_type) not in {'application/http;msgtype=response', 'application/http'}:
-            commentary.error('responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw', content_type)
+        if canon_content_type(content_type) not in {'application/http; msgtype=response', 'application/http'}:
+            commentary.error('responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw:', content_type)
 
         if record.rec_headers.get_header('WARC-IP-Address') is None:
             commentary.error('WARC-IP-Address should be used for http and https responses')
@@ -163,7 +164,7 @@ def validate_resource(record, commentary, pending):
     if target_uri.startswith('dns:'):
         content_type = record.rec_headers.get_header('Content-Type', 'none')
         if content_type.lower() != 'text/dns':
-            commentary.error('recource records for dns: shall have Content-Type of text/dns, saw', content_type)
+            commentary.error('recource records for dns: shall have Content-Type of text/dns, saw:', content_type)
         else:
             # rfc 2540 and rfc 1035
             #validate_text_dns()
@@ -178,8 +179,8 @@ def validate_request(record, commentary, pending):
     if target_uri.startswith('http:') or target_uri.startswith('https:'):
         content_type = record.rec_headers.get_header('Content-Type')
 
-        if canon_content_type(content_type) not in {'application/http;msgtype=request', 'application/http'}:
-            commentary.error('requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw', content_type)
+        if canon_content_type(content_type) not in {'application/http; msgtype=request', 'application/http'}:
+            commentary.error('requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw:', content_type)
 
         if record.rec_headers.get_header('WARC-IP-Address') is None:
             commentary.error('WARC-IP-Address should be used for http and https requests')
@@ -225,7 +226,7 @@ def validate_revisit(record, commentary, pending):
         #     if yes, should be like a response record, truncated if desired
         #   WARC-Refers-To-Date should be the same as WARC-Date in the original record if present
     else:
-        commentary.comment('no revisit details validation done due to unknown profile')
+        commentary.comment('no revisit details validation done due to unknown profile:', warc_profile)
 
 
 def validate_conversion(record, commentary, pending):
@@ -239,7 +240,7 @@ def validate_continuation(record, commentary, pending):
 
     segment_number = record.rec_headers.get_header('WARC-Segment-Number', 'none')
     if segment_number.isdigit() and int(segment_number) < 2:
-        commentary.error('continuation record must have WARC-Segment-Number > 1, saw', segment_number)
+        commentary.error('continuation record must have WARC-Segment-Number > 1, saw:', segment_number)
 
     # last segment: required WARC-Segment-Total-Length, optional WARC-Truncated
 
@@ -251,30 +252,30 @@ def validate_actual_uri(field, value, record, version, commentary, pending):
     # schemes are case-insensitive and normalize to lower
     if value.startswith('<') or value.endswith('>'):
         # wget 1.19 bug caused by WARC 1.0 spec error
-        commentary.error('uri must not be within <>', field, value)
+        commentary.error('uri must not be within <>:', field, value)
     if ':' not in value:
-        commentary.error('invalid uri, no scheme', field, value)
+        commentary.error('invalid uri, no scheme:', field, value)
     if re.search(r'\s', value):
-        commentary.error('invalid uri, contains whitespace', field, value)
+        commentary.error('invalid uri, contains whitespace:', field, value)
     scheme = value.split(':', 1)[0]
     if not re.search(r'\A[A-Za-z][A-Za-z0-9+\-\.]*\Z', scheme):
-        commentary.error('invalid uri scheme, bad character', field, value)
+        commentary.error('invalid uri scheme, bad character:', field, value)
     # https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
 
 
 def validate_warc_type(field, value, record, version, commentary, pending):
     if not value.islower():
         # I am unclear if this is allowed? standard is silent
-        commentary.comment('WARC-Type is not lower-case', field, value)
+        commentary.comment('WARC-Type is not lower-case:', field, value)
     if value.lower() not in record_types:
         # standard says readers should ignore unknown warc-types
-        commentary.comment('unknown WARC-Type', field, value)
+        commentary.comment('unknown WARC-Type:', field, value)
 
 
 def validate_uri(field, value, record, version, commentary, pending):
     # < uri >
     if not (value.startswith('<') and value.endswith('>')):
-        commentary.error('uri must be within <>', field, value)
+        commentary.error('uri must be within <>:', field, value)
         return
     validate_actual_uri(field, value[1:-1], record, version, commentary, pending)
 
@@ -289,12 +290,12 @@ def validate_timestamp(field, value, record, version, commentary, pending):
     if not use_ms:
         if '.' in value:
             # XXX specification infelicity: would be nice to have 'advice to implementers' here
-            commentary.error('WARC 1.0 may not have fractional seconds', field, value)
+            commentary.error('WARC 1.0 time may not have fractional seconds:', field, value)
     else:
         if '.' in value:
             start, end = value.split('.', 1)
             if not re.search(r'\A[0-9]{1,9}Z\Z', end):
-                commentary.error('fractional seconds must have 1-9 digits', field, value)
+                commentary.error('fractional seconds must have 1-9 digits:', field, value)
 
     # XXX the above is pretty incomplete for dash, colon, trailing Z, etc
 
@@ -304,7 +305,7 @@ def validate_timestamp(field, value, record, version, commentary, pending):
 
 def validate_content_length(field, value, record, version, commentary, pending):
     if not value.isdigit():
-        commentary.error('must be an integer', field, value)
+        commentary.error('must be an integer:', field, value)
 
 
 token_re = r'\A[!"#$%&\'()*+\-\.0-9A-Z\^_`a-z|~]+\Z'
@@ -313,7 +314,7 @@ def validate_content_length(field, value, record, version, commentary, pending):
 
 def validate_content_type(field, value, record, version, commentary, pending):
     if '/' not in value:
-        commentary.error('must contain a /', field, value)
+        commentary.error('must contain a /:', field, value)
     splits = value.split('/', 1)
     ctype = splits[0]
     if len(splits) > 1:
@@ -321,13 +322,13 @@ def validate_content_type(field, value, record, version, commentary, pending):
     else:
         rest = ''
     if not re.search(token_re, ctype):
-        commentary.error('invalid type', field, value)
+        commentary.error('invalid type:', field, value)
     if ';' in rest:
         subtype, rest = rest.split(';', 1)
     else:
         subtype = rest
     if not re.search(token_re, subtype):
-        commentary.error('invalid subtype', field, value)
+        commentary.error('invalid subtype:', field, value)
 
     # at this point there can be multiple parameters,
     # some of which could have quoted string values with ; in them
@@ -337,7 +338,7 @@ def validate_content_type(field, value, record, version, commentary, pending):
 
 def validate_digest(field, value, record, version, commentary, pending):
     if ':' not in value:
-        commentary.error('missing algorithm', field, value)
+        commentary.error('missing algorithm:', field, value)
     splits = value.split(':', 1)
     algorithm = splits[0]
     if len(splits) > 1:
@@ -345,18 +346,19 @@ def validate_digest(field, value, record, version, commentary, pending):
     else:
         digest = 'none'
     if not re.search(token_re, algorithm):
-        commentary.error('invalid algorithm', field, value)
+        commentary.error('invalid algorithm:', field, value)
     else:
         try:
             Digester(algorithm)
         except ValueError:
-            commentary.comment('unknown digest algorithm', field, value)
+            commentary.comment('unknown digest algorithm:', field, value)
     if not re.search(token_re, digest):
         # https://github.com/iipc/warc-specifications/issues/48
         # commentary.comment('spec incorrectly says this is an invalid digest', field, value)
         pass
     if not re.search(digest_re, digest):
-        commentary.comment('Invalid-looking digest value', field, value)
+        # suggested in https://github.com/iipc/warc-specifications/issues/48
+        commentary.comment('Invalid-looking digest value:', field, value)
 
 
 def validate_ip(field, value, record, version, commentary, pending):
@@ -366,14 +368,14 @@ def validate_ip(field, value, record, version, commentary, pending):
             value = unicode(value)
         ipaddress.ip_address(value)
     except ValueError:
-        commentary.error('invalid ip', field, value)
+        commentary.error('invalid ip:', field, value)
     except (ImportError, NameError):  # pragma: no cover
         commentary.comment('did not check ip address format, install ipaddress module from pypi if you care')
 
 
 def validate_truncated(field, value, record, version, commentary, pending):
     if value.lower() not in {'length', 'time', 'disconnect', 'unspecified'}:
-        commentary.comment('extension seen', field, value)
+        commentary.comment('extension seen:', field, value)
 
 
 def validate_warcinfo_id(field, value, record, version, commentary, pending):
@@ -400,31 +402,31 @@ def validate_filename(field, value, record, version, commentary, pending):
 
 def validate_profile(field, value, record, version, commentary, pending):
     if version not in profiles:
-        commentary.comment('no profile check because unknown warc version', field, value)
+        commentary.comment('no profile check because unknown warc version:', field, value)
         return
     if value not in profiles[version]:
-        commentary.comment('extension seen', field, value)
+        commentary.comment('extension seen:', field, value)
 
 
 def validate_segment_number(field, value, record, version, commentary, pending):
     if not value.isdigit():
-        commentary.error('must be an integer', field, value)
+        commentary.error('must be an integer:', field, value)
         return
     iv = int(value)
     if iv == 0:
-        commentary.error('must be 1 or greater', field, value)
+        commentary.error('must be 1 or greater:', field, value)
 
     rec_type = record.rec_headers.get_header('WARC-Type', 'none')
     if rec_type != 'continuation':
         if iv != 1:
-            commentary.error('non-continuation records must always have WARC-Segment-Number = 1', field, value)
+            commentary.error('non-continuation records must always have WARC-Segment-Number: 1:', field, value)
     if rec_type in {'warcinfo', 'request', 'metadata', 'revisit'}:
         commentary.recommendation('do not segment WARC-Type', rec_type)
 
 
 def validate_segment_total_length(field, value, record, version, commentary, pending):
     if not value.isdigit():
-        commentary.error('must be an integer', field, value)
+        commentary.error('must be an integer:', field, value)
 
 
 warc_fields = {
@@ -568,21 +570,21 @@ def make_header_set(config, kinds):
 def validate_fields_against_rec_type(rec_type, config, rec_headers, commentary, allow_all=False):
     for req in sorted(config.get('required', [])):
         if not rec_headers.get_header(req):
-            commentary.error('missing required header', req)
+            commentary.error('missing required header:', req)
     for rec in sorted(config.get('recommended', [])):
         if not rec_headers.get_header(rec):
-            commentary.recommendation('missing recommended header', rec)
+            commentary.recommendation('missing recommended header:', rec)
     allowed = make_header_set(config, ('required', 'optional', 'recommended'))
     prohibited = make_header_set(config, ('prohibited',))
 
     for field, value in rec_headers.headers:
         fl = field.lower()
         if fl in prohibited:
-            commentary.error('field not allowed in record_type', field, rec_type)
+            commentary.error('field not allowed in record type:', rec_type, field)
         elif allow_all or fl in allowed:
             pass
         elif fl in warc_fields:
-            commentary.comment('no configuration seen for', field, rec_type)
+            commentary.comment('Unknown field for this record type, perhaps an extension:', rec_type, field)
         else:
             # an 'unknown field' comment has already been issued in validate_record
             pass
@@ -605,16 +607,16 @@ def validate_record(record):
     for field, value in record.rec_headers.headers:
         field_l = field.lower()
         if field != 'warc-concurrent-to' and field_l in seen_fields:
-            commentary.error('duplicate field seen', field, value)
+            commentary.error('duplicate field seen:', field, value)
         seen_fields.add(field_l)
         if field_l not in warc_fields:
-            commentary.comment('unknown field, no validation performed', field, value)
+            commentary.comment('unknown field, no validation performed:', field, value)
             continue
         config = warc_fields[field_l]
         if 'minver' in config:
             if version < config['minver']:
                 # unknown fields are extensions, so this is a comment and not an error
-                commentary.comment('field was introduced after this warc version', field, value, version)
+                commentary.comment('field was introduced after this warc version:', version, field, value)
         if 'validate' in config:
             config['validate'](field, value, record, version, commentary, pending)
 

From 8b9032d64251c773f9f6b8b82ec15d36aa1959f9 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Mon, 28 Jan 2019 13:15:30 -0800
Subject: [PATCH 18/68] use valid record ids

---
 .../data/standard-torture-validate-field.warc | 106 +++++++++---------
 .../standard-torture-validate-record.warc     |  32 +++---
 test/test_tests.py                            |  62 ++++------
 warcio/tester.py                              |  30 ++---
 4 files changed, 107 insertions(+), 123 deletions(-)

diff --git a/test/data/standard-torture-validate-field.warc b/test/data/standard-torture-validate-field.warc
index c88d3ee6..816413be 100644
--- a/test/data/standard-torture-validate-field.warc
+++ b/test/data/standard-torture-validate-field.warc
@@ -1,53 +1,53 @@
-WARC/1.0
-WARC-Target-URI: <http://example.com/>
-WARC-Target-URI: example.com
-WARC-Target-URI: ex ample.com
-WARC-Target-URI: h<>ttp://example.com/
-WARC-Type: does-not-exist
-WARC-Type: CAPITALIZED
-WARC-Concurrent-To: http://example.com/
-WARC-Record-ID: <foo:bar>
-WARC-Date: 2017-03-06T04:03:53Z
-WARC-Date: 2017-03-06T04:03:53.Z
-Content-Type: asdf
-Content-Type: has space/asdf
-Content-Type: asdf/has space
-Content-Type: asdf/has space;asdf
-WARC-Block-Digest: asdf
-WARC-Block-Digest: has space:asdf
-WARC-Block-Digest: sha1:&$*^&*^#*&^
-WARC-IP-Address: 1.2.3.4.5
-WARC-Truncated: invalid
-WARC-Warcinfo-ID: asdf:asdf
-WARC-Filename: not-yet-tested
-WARC-Profile: asdf
-WARC-Profile: http://netpreserve.org/warc/1.0/revisit/identical-payload-digest
-WARC-Identified-Payload-Type: asdf
-WARC-Segment-Origin-ID: http://example.com
-WARC-Segment-Number: not-an-integer
-WARC-Segment-Number: 0
-WARC-Segment-Number: 1
-WARC-Segment-Number: 2
-WARC-Segment-Total-Length: 0
-WARC-Segment-Total-Length: not-an-integer
-WARC-Refers-To-Target-URI: http://example.com
-WARC-Refers-To-Date: not-a-date
-WARC-Unknown-Field: asdf
-Content-Length: 0
-
-
-WARC/1.1
-WARC-Date: 2017-03-06T04:03:53Z
-WARC-Date: 2017-03-06T04:03:53.Z
-WARC-Date: 2017-03-06T04:03:53.0Z
-WARC-Type: invalid
-Content-Length: 0
-
-
-WARC/1.1
-WARC-Type: request
-WARC-Segment-Number: 1
-Content-Length: 0
-
-
-WARC/invalid
+WARC/1.0
+WARC-Target-URI: <http://example.com/>
+WARC-Target-URI: example.com
+WARC-Target-URI: ex ample.com
+WARC-Target-URI: h<>ttp://example.com/
+WARC-Type: does-not-exist
+WARC-Type: CAPITALIZED
+WARC-Concurrent-To: http://example.com/
+WARC-Record-ID: <urn:uuid:torture-validate-field>
+WARC-Date: 2017-03-06T04:03:53Z
+WARC-Date: 2017-03-06T04:03:53.Z
+Content-Type: asdf
+Content-Type: has space/asdf
+Content-Type: asdf/has space
+Content-Type: asdf/has space;asdf
+WARC-Block-Digest: asdf
+WARC-Block-Digest: has space:asdf
+WARC-Block-Digest: sha1:&$*^&*^#*&^
+WARC-IP-Address: 1.2.3.4.5
+WARC-Truncated: invalid
+WARC-Warcinfo-ID: asdf:asdf
+WARC-Filename: not-yet-tested
+WARC-Profile: asdf
+WARC-Profile: http://netpreserve.org/warc/1.0/revisit/identical-payload-digest
+WARC-Identified-Payload-Type: asdf
+WARC-Segment-Origin-ID: http://example.com
+WARC-Segment-Number: not-an-integer
+WARC-Segment-Number: 0
+WARC-Segment-Number: 1
+WARC-Segment-Number: 2
+WARC-Segment-Total-Length: 0
+WARC-Segment-Total-Length: not-an-integer
+WARC-Refers-To-Target-URI: http://example.com
+WARC-Refers-To-Date: not-a-date
+WARC-Unknown-Field: asdf
+Content-Length: 0
+
+
+WARC/1.1
+WARC-Date: 2017-03-06T04:03:53Z
+WARC-Date: 2017-03-06T04:03:53.Z
+WARC-Date: 2017-03-06T04:03:53.0Z
+WARC-Type: invalid
+Content-Length: 0
+
+
+WARC/1.1
+WARC-Type: request
+WARC-Segment-Number: 1
+Content-Length: 0
+
+
+WARC/invalid
diff --git a/test/data/standard-torture-validate-record.warc b/test/data/standard-torture-validate-record.warc
index fa03b38e..da6a2aaf 100644
--- a/test/data/standard-torture-validate-record.warc
+++ b/test/data/standard-torture-validate-record.warc
@@ -15,7 +15,7 @@ token cannot have a space:
 
 
 WARC/1.0
-WARC-Record-ID: test-empty-warc-fields
+WARC-Record-ID: <uri:uuid:test-empty-warc-fields>
 WARC-Type: warcinfo
 Content-Type: application/warc-fields
 Content-Length: 0
@@ -23,7 +23,7 @@ Content-Length: 0
 
 WARC/1.0
 WARC-Type: warcinfo
-WARC-Record-ID: test-warcinfo-non-recommended-content-type
+WARC-Record-ID: <uri:uuid:test-warcinfo-non-recommended-content-type>
 Content-Type: not-application/warc-fields
 Content-Length: 5
 
@@ -32,7 +32,7 @@ foo
 
 WARC/1.0
 WARC-Type: response
-WARC-Record-ID: test-response-content-type
+WARC-Record-ID: <uri:uuid:test-response-content-type>
 WARC-Target-URI: HtTp://example.com/
 Content-Type: text/plain
 Content-Length: 0
@@ -40,7 +40,7 @@ Content-Length: 0
 
 WARC/1.0
 WARC-Type: resource
-WARC-Record-ID: test-resource-dns-content-type
+WARC-Record-ID: <uri:uuid:test-resource-dns-content-type>
 WARC-Target-URI: DnS:asdfasdf
 Content-Type: text/plain
 Content-Length: 0
@@ -48,7 +48,7 @@ Content-Length: 0
 
 WARC/1.0
 WARC-Type: resource
-WARC-Record-ID: test-resource-dns-empty
+WARC-Record-ID: <uri:uuid:test-resource-dns-empty>
 WARC-Test-TODO: add another with valid block
 WARC-Target-URI: DnS:asdfasdf
 Content-Type: text/dns
@@ -57,14 +57,14 @@ Content-Length: 0
 
 WARC/1.0
 WARC-Type: resource
-WARC-Record-ID: test-resource-not-dns
+WARC-Record-ID: <uri:uuid:test-resource-not-dns>
 WARC-Target-URI: foo:bar
 Content-Length: 0
 
 
 WARC/1.0
 WARC-Type: request
-WARC-Record-ID: test-request-unrecommended-content-type
+WARC-Record-ID: <uri:uuid:test-request-content-type>
 WARC-Target-URI: hTtP://example.com/
 Content-Type: text/plain
 Content-Length: 0
@@ -72,7 +72,7 @@ Content-Length: 0
 
 WARC/1.0
 WARC-Type: request
-WARC-Record-ID: test-request-unrecommended-content-type-with-ip
+WARC-Record-ID: <uri:uuid:test-request-content-type-with-ip>
 WARC-Target-URI: hTtP://example.com/
 WARC-IP-Address: 1.2.3.4
 Content-Type: text/plain
@@ -81,55 +81,55 @@ Content-Length: 0
 
 WARC/1.0
 WARC-Type: metadata
-WARC-Record-ID: test-metadata-warc-fields-empty
+WARC-Record-ID: <uri:uuid:test-metadata-warc-fields-empty>
 Content-Type: application/warc-fields
 Content-Length: 0
 
 
 WARC/1.0
 WARC-Type: metadata
-WARC-Record-ID: test-metadata-not-warc-fields
+WARC-Record-ID: <uri:uuid:test-metadata-not-warc-fields>
 Content-Type: not-application/warc-fields
 Content-Length: 0
 
 
 WARC/1.0
 WARC-Type: revisit
-WARC-Record-ID: test-revisit-profile-unknown
+WARC-Record-ID: <uri:uuid:test-revisit-profile-unknown>
 WARC-Profile: none
 Content-Length: 0
 
 
 WARC/1.0
 WARC-Type: revisit
-WARC-Record-ID: test-revisit-profile-future
+WARC-Record-ID: <uri:uuid:test-revisit-profile-future>
 WARC-Profile: http://netpreserve.org/warc/1.1/revisit/identical-payload-digest
 Content-Length: 0
 
 
 WARC/1.0
 WARC-Type: revisit
-WARC-Record-ID: test-revisit-profile-good
+WARC-Record-ID: <uri:uuid:test-revisit-profile-good>
 WARC-Profile: http://netpreserve.org/warc/1.0/revisit/server-not-modified
 Content-Length: 0
 
 
 WARC/1.0
 WARC-Type: conversion
-WARC-Record-ID: test-conversion
+WARC-Record-ID: <uri:uuid:test-conversion>
 Content-Length: 0
 
 
 WARC/1.0
 WARC-Type: continuation
-WARC-Record-ID: test-continuation-segment-1
+WARC-Record-ID: <uri:uuid:test-continuation-segment-1>
 WARC-Segment-Number: 1
 Content-Length: 0
 
 
 WARC/1.0
 WARC-Type: continuation
-WARC-Record-ID: test-continuation-segment-valid
+WARC-Record-ID: <uri:uuid:test-continuation-segment-valid>
 WARC-Segment-Number: 2
 Content-Length: 0
 
diff --git a/test/test_tests.py b/test/test_tests.py
index 91eba656..c08a19f6 100644
--- a/test/test_tests.py
+++ b/test/test_tests.py
@@ -51,80 +51,68 @@ def test_torture_validate_record():
     comment: warc-fields lines must end with \\r\\n: test: lines should end with \\r\\n
     comment: Missing colon in warc-fields line: no colon
     comment: Invalid warc-fields name: token cannot have a space
-  WARC-Record-ID test-empty-warc-fields
+  WARC-Record-ID <uri:uuid:test-empty-warc-fields>
     WARC-Type warcinfo
     digest not present
-    error: uri must be within <>: WARC-Record-ID test-empty-warc-fields
     error: missing required header: WARC-Date
     comment: warc-fields body present but empty
-  WARC-Record-ID test-warcinfo-non-recommended-content-type
+  WARC-Record-ID <uri:uuid:test-warcinfo-non-recommended-content-type>
     WARC-Type warcinfo
     digest not present
-    error: uri must be within <>: WARC-Record-ID test-warcinfo-non-recommended-content-type
     error: missing required header: WARC-Date
-    recommendation: warcinfo Content-Type recommended to be application/warc-fields, saw: not-application/warc-fields
-  WARC-Record-ID test-response-content-type
+    recommendation: warcinfo Content-Type recommended to be application/warc-fields: not-application/warc-fields
+  WARC-Record-ID <uri:uuid:test-response-content-type>
     WARC-Type response
     digest not present
-    error: uri must be within <>: WARC-Record-ID test-response-content-type
     error: missing required header: WARC-Date
-    error: responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw: text/plain
+    error: responses for http/https should have Content-Type of application/http; msgtype=response or application/http: text/plain
     error: WARC-IP-Address should be used for http and https responses
-  WARC-Record-ID test-resource-dns-content-type
+  WARC-Record-ID <uri:uuid:test-resource-dns-content-type>
     WARC-Type resource
     digest not present
-    error: uri must be within <>: WARC-Record-ID test-resource-dns-content-type
     error: missing required header: WARC-Date
-    error: recource records for dns: shall have Content-Type of text/dns, saw: text/plain
-  WARC-Record-ID test-resource-dns-empty
+    error: resource records for dns shall have Content-Type of text/dns: text/plain
+  WARC-Record-ID <uri:uuid:test-resource-dns-empty>
     WARC-Type resource
     digest not present
-    error: uri must be within <>: WARC-Record-ID test-resource-dns-empty
     error: missing required header: WARC-Date
     comment: unknown field, no validation performed: WARC-Test-TODO add another with valid block
-  WARC-Record-ID test-resource-not-dns
+  WARC-Record-ID <uri:uuid:test-resource-not-dns>
     WARC-Type resource
     digest not present
-    error: uri must be within <>: WARC-Record-ID test-resource-not-dns
     error: missing required header: Content-Type
     error: missing required header: WARC-Date
-  WARC-Record-ID test-request-unrecommended-content-type
+  WARC-Record-ID <uri:uuid:test-request-content-type>
     WARC-Type request
     digest not present
-    error: uri must be within <>: WARC-Record-ID test-request-unrecommended-content-type
     error: missing required header: WARC-Date
-    error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw: text/plain
+    error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http: text/plain
     error: WARC-IP-Address should be used for http and https requests
-  WARC-Record-ID test-request-unrecommended-content-type-with-ip
+  WARC-Record-ID <uri:uuid:test-request-content-type-with-ip>
     WARC-Type request
     digest not present
-    error: uri must be within <>: WARC-Record-ID test-request-unrecommended-content-type-with-ip
     error: missing required header: WARC-Date
-    error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw: text/plain
-  WARC-Record-ID test-metadata-warc-fields-empty
+    error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http: text/plain
+  WARC-Record-ID <uri:uuid:test-metadata-warc-fields-empty>
     WARC-Type metadata
     digest not present
-    error: uri must be within <>: WARC-Record-ID test-metadata-warc-fields-empty
     error: missing required header: WARC-Date
     comment: warc-fields body present but empty
-  WARC-Record-ID test-metadata-not-warc-fields
+  WARC-Record-ID <uri:uuid:test-metadata-not-warc-fields>
     WARC-Type metadata
     digest not present
-    error: uri must be within <>: WARC-Record-ID test-metadata-not-warc-fields
     error: missing required header: WARC-Date
-  WARC-Record-ID test-revisit-profile-unknown
+  WARC-Record-ID <uri:uuid:test-revisit-profile-unknown>
     WARC-Type revisit
     digest not present
-    error: uri must be within <>: WARC-Record-ID test-revisit-profile-unknown
     error: missing required header: Content-Type
     error: missing required header: WARC-Date
     error: missing required header: WARC-Target-URI
     comment: extension seen: WARC-Profile none
     comment: no revisit details validation done due to unknown profile: none
-  WARC-Record-ID test-revisit-profile-future
+  WARC-Record-ID <uri:uuid:test-revisit-profile-future>
     WARC-Type revisit
     digest not present
-    error: uri must be within <>: WARC-Record-ID test-revisit-profile-future
     error: missing required header: Content-Type
     error: missing required header: WARC-Date
     error: missing required header: WARC-Target-URI
@@ -133,34 +121,30 @@ def test_torture_validate_record():
     recommendation: missing recommended header: WARC-Refers-To-Date
     recommendation: missing recommended header: WARC-Refers-To-Target-URI
     comment: extension seen: WARC-Profile http://netpreserve.org/warc/1.1/revisit/identical-payload-digest
-  WARC-Record-ID test-revisit-profile-good
+  WARC-Record-ID <uri:uuid:test-revisit-profile-good>
     WARC-Type revisit
     digest not present
-    error: uri must be within <>: WARC-Record-ID test-revisit-profile-good
     error: missing required header: Content-Type
     error: missing required header: WARC-Date
     error: missing required header: WARC-Target-URI
     recommendation: missing recommended header: WARC-Refers-To
     recommendation: missing recommended header: WARC-Refers-To-Date
-  WARC-Record-ID test-conversion
+  WARC-Record-ID <uri:uuid:test-conversion>
     WARC-Type conversion
     digest not present
-    error: uri must be within <>: WARC-Record-ID test-conversion
     error: missing required header: WARC-Date
     error: missing required header: WARC-Target-URI
-  WARC-Record-ID test-continuation-segment-1
+  WARC-Record-ID <uri:uuid:test-continuation-segment-1>
     WARC-Type continuation
     digest not present
-    error: uri must be within <>: WARC-Record-ID test-continuation-segment-1
     error: missing required header: WARC-Date
     error: missing required header: WARC-Segment-Origin-ID
     error: missing required header: WARC-Target-URI
-    error: continuation record must have WARC-Segment-Number > 1, saw: 1
+    error: continuation record must have WARC-Segment-Number > 1: 1
     comment: warcio test continuation code has not been tested, expect bugs
-  WARC-Record-ID test-continuation-segment-valid
+  WARC-Record-ID <uri:uuid:test-continuation-segment-valid>
     WARC-Type continuation
     digest not present
-    error: uri must be within <>: WARC-Record-ID test-continuation-segment-valid
     error: missing required header: WARC-Date
     error: missing required header: WARC-Segment-Origin-ID
     error: missing required header: WARC-Target-URI
@@ -184,7 +168,7 @@ def test_torture_validate_field():
 
     expected = """\
 test/data/standard-torture-validate-field.warc
-  WARC-Record-ID <foo:bar>
+  WARC-Record-ID <urn:uuid:torture-validate-field>
     WARC-Type does-not-exist
     unknown hash algorithm name in block digest
     error: uri must not be within <>: WARC-Target-URI <http://example.com/>
diff --git a/warcio/tester.py b/warcio/tester.py
index 4ee05f1f..023cdb29 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -126,7 +126,7 @@ def validate_warc_fields(record, commentary):
 def validate_warcinfo(record, commentary, pending):
     content_type = record.rec_headers.get_header('Content-Type', 'none')
     if content_type.lower() != 'application/warc-fields':
-        commentary.recommendation('warcinfo Content-Type recommended to be application/warc-fields, saw:', content_type)
+        commentary.recommendation('warcinfo Content-Type recommended to be application/warc-fields:', content_type)
     else:
         #   format: warc-fields
         #   allowable fields include but not limited to DMCI plus the following
@@ -147,7 +147,7 @@ def validate_response(record, commentary, pending):
     if target_uri.startswith('http:') or target_uri.startswith('https:'):
         content_type = record.rec_headers.get_header('Content-Type', 'none')
         if canon_content_type(content_type) not in {'application/http; msgtype=response', 'application/http'}:
-            commentary.error('responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw:', content_type)
+            commentary.error('responses for http/https should have Content-Type of application/http; msgtype=response or application/http:', content_type)
 
         if record.rec_headers.get_header('WARC-IP-Address') is None:
             commentary.error('WARC-IP-Address should be used for http and https responses')
@@ -164,7 +164,7 @@ def validate_resource(record, commentary, pending):
     if target_uri.startswith('dns:'):
         content_type = record.rec_headers.get_header('Content-Type', 'none')
         if content_type.lower() != 'text/dns':
-            commentary.error('recource records for dns: shall have Content-Type of text/dns, saw:', content_type)
+            commentary.error('resource records for dns shall have Content-Type of text/dns:', content_type)
         else:
             # rfc 2540 and rfc 1035
             #validate_text_dns()
@@ -180,7 +180,7 @@ def validate_request(record, commentary, pending):
         content_type = record.rec_headers.get_header('Content-Type')
 
         if canon_content_type(content_type) not in {'application/http; msgtype=request', 'application/http'}:
-            commentary.error('requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw:', content_type)
+            commentary.error('requests for http/https should have Content-Type of application/http; msgtype=request or application/http:', content_type)
 
         if record.rec_headers.get_header('WARC-IP-Address') is None:
             commentary.error('WARC-IP-Address should be used for http and https requests')
@@ -240,12 +240,12 @@ def validate_continuation(record, commentary, pending):
 
     segment_number = record.rec_headers.get_header('WARC-Segment-Number', 'none')
     if segment_number.isdigit() and int(segment_number) < 2:
-        commentary.error('continuation record must have WARC-Segment-Number > 1, saw:', segment_number)
+        commentary.error('continuation record must have WARC-Segment-Number > 1:', segment_number)
 
     # last segment: required WARC-Segment-Total-Length, optional WARC-Truncated
 
 
-def validate_actual_uri(field, value, record, version, commentary, pending):
+def validate_unbracketed_uri(field, value, record, version, commentary, pending):
     # uri per RFC 3986
     # should use a registered scheme
     # %XX encoding, normalize to upper case
@@ -272,16 +272,16 @@ def validate_warc_type(field, value, record, version, commentary, pending):
         commentary.comment('unknown WARC-Type:', field, value)
 
 
-def validate_uri(field, value, record, version, commentary, pending):
+def validate_bracketed_uri(field, value, record, version, commentary, pending):
     # < uri >
     if not (value.startswith('<') and value.endswith('>')):
         commentary.error('uri must be within <>:', field, value)
         return
-    validate_actual_uri(field, value[1:-1], record, version, commentary, pending)
+    validate_unbracketed_uri(field, value[1:-1], record, version, commentary, pending)
 
 
 def validate_record_id(field, value, record, version, commentary, pending):
-    validate_uri(field, value, record, version, commentary, pending)
+    validate_bracketed_uri(field, value, record, version, commentary, pending)
     # TODO: should be "globally unique for its period of intended use"
 
 
@@ -379,7 +379,7 @@ def validate_truncated(field, value, record, version, commentary, pending):
 
 
 def validate_warcinfo_id(field, value, record, version, commentary, pending):
-    validate_uri(field, value, record, version, commentary, pending)
+    validate_bracketed_uri(field, value, record, version, commentary, pending)
     # TODO: should point at a warcinfo record
 
 
@@ -446,7 +446,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen
         'validate': validate_content_type,
     },
     'WARC-Concurrent-To': {
-        'validate': validate_uri,
+        'validate': validate_bracketed_uri,
     },
     'WARC-Block-Digest': {
         'validate': validate_digest,
@@ -458,10 +458,10 @@ def validate_segment_total_length(field, value, record, version, commentary, pen
         'validate': validate_ip,
     },
     'WARC-Refers-To': {
-        'validate': validate_uri,
+        'validate': validate_bracketed_uri,
     },
     'WARC-Target-URI': {
-        'validate': validate_actual_uri,
+        'validate': validate_unbracketed_uri,
     },
     'WARC-Truncated': {
         'validate': validate_truncated,
@@ -479,7 +479,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen
         'validate': validate_content_type,
     },
     'WARC-Segment-Origin-ID': {
-        'validate': validate_uri,
+        'validate': validate_bracketed_uri,
     },
     'WARC-Segment-Number': {
         'validate': validate_segment_number,
@@ -488,7 +488,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen
         'validate': validate_segment_total_length,
     },
     'WARC-Refers-To-Target-URI': {
-        'validate': validate_actual_uri,
+        'validate': validate_unbracketed_uri,
         'minver': '1.1',
     },
     'WARC-Refers-To-Date': {

From 2a10b23aafa5f7023dea11b7a41b8f6f0525331e Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Mon, 28 Jan 2019 13:31:40 -0800
Subject: [PATCH 19/68] warc-segment-number cleaner recommendation

---
 test/test_tests.py |  1 -
 warcio/tester.py   | 12 ++++++++----
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/test/test_tests.py b/test/test_tests.py
index c08a19f6..dcbc3666 100644
--- a/test/test_tests.py
+++ b/test/test_tests.py
@@ -237,7 +237,6 @@ def test_torture_validate_field():
     error: missing required header: WARC-Record-ID
     error: missing required header: WARC-Target-URI
     recommendation: do not segment WARC-Type request
-    comment: Unknown field for this record type, perhaps an extension: request WARC-Segment-Number
 """
 
     value = helper(args, 0)
diff --git a/warcio/tester.py b/warcio/tester.py
index 023cdb29..6346754d 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -503,20 +503,21 @@ def validate_segment_total_length(field, value, record, version, commentary, pen
         'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'Content-Type'],
         'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Filename', 'WARC-Truncated'],
         'prohibited': ['WARC-Refers-To', 'WARC-Profile', 'WARC-Identified-Payload-Type'],
+        'ignored': ['WARC-Segment-Number'],
         'validate': validate_warcinfo,
     },
     'response': {
         'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type',
                      'Content-Type', 'WARC-Target-URI'],
         'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated',
-                     'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address'],
+                     'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Segment-Number'],
         'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
         'validate': validate_response,
     },
     'resource': {
         'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'WARC-Target-URI', 'Content-Type'],
         'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated',
-                     'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-Identified-Payload-Type'],
+                     'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-Identified-Payload-Type', 'WARC-Segment-Number'],
         'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
         'validate': validate_resource,
     },
@@ -526,6 +527,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen
         'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated',
                      'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address'],
         'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
+        'ignored': ['WARC-Segment-Number'],
         'validate': validate_request,
     },
     'metadata': {
@@ -534,6 +536,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen
         'optional': ['WARC-Block-Digest', 'WARC-IP-Address', 'WARC-Truncated',
                      'WARC-Concurrent-To', 'WARC-Refers-To', 'WARC-Target-URI', 'WARC-Warcinfo-ID'],
         'prohibited': ['WARC-Payload-Digest', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
+        'ignored': ['WARC-Segment-Number'],
         'validate': validate_metadata,
     },
     'revisit': {
@@ -542,11 +545,12 @@ def validate_segment_total_length(field, value, record, version, commentary, pen
         'optional': ['WARC-Block-Digest', 'WARC-Truncated', 'WARC-IP-Address', 'WARC-Warcinfo-ID',  # normal optionals
                      'WARC-Payload-Digest', 'WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date'],  # these are for profiles
         'prohibited': ['WARC-Filename'],
+        'ignored': ['WARC-Segment-Number'],
         'validate': validate_revisit,
     },
     'conversion': {
         'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'WARC-Target-URI'],
-        'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Truncated', 'WARC-Refers-To', 'WARC-Warcinfo-ID'],
+        'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Truncated', 'WARC-Refers-To', 'WARC-Warcinfo-ID', 'WARC-Segment-Number'],
         'prohibited': ['WARC-Concurrent-To', 'WARC-IP-Address', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
         'validate': validate_conversion,
     },
@@ -574,7 +578,7 @@ def validate_fields_against_rec_type(rec_type, config, rec_headers, commentary,
     for rec in sorted(config.get('recommended', [])):
         if not rec_headers.get_header(rec):
             commentary.recommendation('missing recommended header:', rec)
-    allowed = make_header_set(config, ('required', 'optional', 'recommended'))
+    allowed = make_header_set(config, ('required', 'optional', 'recommended', 'ignored'))
     prohibited = make_header_set(config, ('prohibited',))
 
     for field, value in rec_headers.headers:

From 81c9f0a4e96b82dbb8181531239fa00231b33887 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Mon, 28 Jan 2019 13:55:54 -0800
Subject: [PATCH 20/68] segment origin id

---
 test/test_tests.py |  1 +
 warcio/tester.py   | 23 +++++++++++++----------
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/test/test_tests.py b/test/test_tests.py
index dcbc3666..598ba49b 100644
--- a/test/test_tests.py
+++ b/test/test_tests.py
@@ -232,6 +232,7 @@ def test_torture_validate_field():
   WARC-Record-ID None
     WARC-Type request
     digest not present
+    error: segmented records must have both WARC-Segment-Number and WARC-Segment-Origin-ID
     error: missing required header: Content-Type
     error: missing required header: WARC-Date
     error: missing required header: WARC-Record-ID
diff --git a/warcio/tester.py b/warcio/tester.py
index 6346754d..632de060 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -420,6 +420,9 @@ def validate_segment_number(field, value, record, version, commentary, pending):
     if rec_type != 'continuation':
         if iv != 1:
             commentary.error('non-continuation records must always have WARC-Segment-Number: 1:', field, value)
+        origin_id = record.rec_headers.get_header('WARC-Segment-Origin-ID')
+        if origin_id is None:
+            commentary.error('segmented records must have both WARC-Segment-Number and WARC-Segment-Origin-ID')
     if rec_type in {'warcinfo', 'request', 'metadata', 'revisit'}:
         commentary.recommendation('do not segment WARC-Type', rec_type)
 
@@ -503,21 +506,21 @@ def validate_segment_total_length(field, value, record, version, commentary, pen
         'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'Content-Type'],
         'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Filename', 'WARC-Truncated'],
         'prohibited': ['WARC-Refers-To', 'WARC-Profile', 'WARC-Identified-Payload-Type'],
-        'ignored': ['WARC-Segment-Number'],
+        'ignored': ['WARC-Segment-Number', 'WARC-Segment-Origin-ID'],
         'validate': validate_warcinfo,
     },
     'response': {
         'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type',
                      'Content-Type', 'WARC-Target-URI'],
         'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated',
-                     'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Segment-Number'],
+                     'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Segment-Number', 'WARC-Segment-Origin-ID'],
         'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
         'validate': validate_response,
     },
     'resource': {
         'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'WARC-Target-URI', 'Content-Type'],
         'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated',
-                     'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-Identified-Payload-Type', 'WARC-Segment-Number'],
+                     'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-Identified-Payload-Type', 'WARC-Segment-Number', 'WARC-Segment-Origin-ID'],
         'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
         'validate': validate_resource,
     },
@@ -527,7 +530,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen
         'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated',
                      'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address'],
         'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
-        'ignored': ['WARC-Segment-Number'],
+        'ignored': ['WARC-Segment-Number', 'WARC-Segment-Origin-ID'],
         'validate': validate_request,
     },
     'metadata': {
@@ -536,7 +539,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen
         'optional': ['WARC-Block-Digest', 'WARC-IP-Address', 'WARC-Truncated',
                      'WARC-Concurrent-To', 'WARC-Refers-To', 'WARC-Target-URI', 'WARC-Warcinfo-ID'],
         'prohibited': ['WARC-Payload-Digest', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
-        'ignored': ['WARC-Segment-Number'],
+        'ignored': ['WARC-Segment-Number', 'WARC-Segment-Origin-ID'],
         'validate': validate_metadata,
     },
     'revisit': {
@@ -545,18 +548,18 @@ def validate_segment_total_length(field, value, record, version, commentary, pen
         'optional': ['WARC-Block-Digest', 'WARC-Truncated', 'WARC-IP-Address', 'WARC-Warcinfo-ID',  # normal optionals
                      'WARC-Payload-Digest', 'WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date'],  # these are for profiles
         'prohibited': ['WARC-Filename'],
-        'ignored': ['WARC-Segment-Number'],
+        'ignored': ['WARC-Segment-Number', 'WARC-Segment-Origin-ID'],
         'validate': validate_revisit,
     },
     'conversion': {
         'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'WARC-Target-URI'],
-        'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Truncated', 'WARC-Refers-To', 'WARC-Warcinfo-ID', 'WARC-Segment-Number'],
+        'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Truncated', 'WARC-Refers-To', 'WARC-Warcinfo-ID', 'WARC-Segment-Number', 'WARC-Segment-Origin-ID'],
         'prohibited': ['WARC-Concurrent-To', 'WARC-IP-Address', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
         'validate': validate_conversion,
     },
     'continuation': {
         'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type',
-                     'WARC-Segment-Origin-ID', 'WARC-Segment-Number', 'WARC-Target-URI'],
+                     'WARC-Segment-Number', 'WARC-Segment-Origin-ID', 'WARC-Target-URI'],
         'optional': ['WARC-Segment-Total-Length', 'WARC-Truncated'],
         'prohibited': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
         'validate': validate_continuation,
@@ -587,8 +590,8 @@ def validate_fields_against_rec_type(rec_type, config, rec_headers, commentary,
             commentary.error('field not allowed in record type:', rec_type, field)
         elif allow_all or fl in allowed:
             pass
-        elif fl in warc_fields:
-            commentary.comment('Unknown field for this record type, perhaps an extension:', rec_type, field)
+        elif fl in warc_fields:  # pragma: no cover (this is a configuration error, if it happens)
+            commentary.comment('Known field, but not expected for this record type:', rec_type, field)
         else:
             # an 'unknown field' comment has already been issued in validate_record
             pass

From c78343a166264f7df0889865d261a768c46fe5ad Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Mon, 28 Jan 2019 14:51:19 -0800
Subject: [PATCH 21/68] timestamp checking

---
 test/test_tests.py |  6 ++++--
 warcio/tester.py   | 21 ++++++++-------------
 2 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/test/test_tests.py b/test/test_tests.py
index 598ba49b..89851eca 100644
--- a/test/test_tests.py
+++ b/test/test_tests.py
@@ -184,7 +184,8 @@ def test_torture_validate_field():
     error: duplicate field seen: WARC-Type CAPITALIZED
     error: uri must be within <>: WARC-Concurrent-To http://example.com/
     error: duplicate field seen: WARC-Date 2017-03-06T04:03:53.Z
-    error: WARC 1.0 time may not have fractional seconds: WARC-Date 2017-03-06T04:03:53.Z
+    error: Invalid timestamp: WARC-Date 2017-03-06T04:03:53.Z
+    error: WARC versions <= 1.0 may not have timestamps with fractional seconds: WARC-Date 2017-03-06T04:03:53.Z
     error: must contain a /: Content-Type asdf
     error: invalid subtype: Content-Type asdf
     error: duplicate field seen: Content-Type has space/asdf
@@ -212,6 +213,7 @@ def test_torture_validate_field():
     error: non-continuation records must always have WARC-Segment-Number: 1: WARC-Segment-Number 2
     error: duplicate field seen: WARC-Segment-Total-Length not-an-integer
     error: must be an integer: WARC-Segment-Total-Length not-an-integer
+    error: Invalid timestamp: WARC-Refers-To-Date not-a-date
     comment: unknown WARC-Type: WARC-Type does-not-exist
     comment: WARC-Type is not lower-case: WARC-Type CAPITALIZED
     comment: unknown WARC-Type: WARC-Type CAPITALIZED
@@ -226,7 +228,7 @@ def test_torture_validate_field():
     WARC-Type invalid
     digest not present
     error: duplicate field seen: WARC-Date 2017-03-06T04:03:53.Z
-    error: fractional seconds must have 1-9 digits: WARC-Date 2017-03-06T04:03:53.Z
+    error: Invalid timestamp: WARC-Date 2017-03-06T04:03:53.Z
     error: duplicate field seen: WARC-Date 2017-03-06T04:03:53.0Z
     comment: unknown WARC-Type: WARC-Type invalid
   WARC-Record-ID None
diff --git a/warcio/tester.py b/warcio/tester.py
index 632de060..5396ff3b 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -286,21 +286,16 @@ def validate_record_id(field, value, record, version, commentary, pending):
 
 
 def validate_timestamp(field, value, record, version, commentary, pending):
-    use_ms = False if version == '1.0' else True
-    if not use_ms:
-        if '.' in value:
-            # XXX specification infelicity: would be nice to have 'advice to implementers' here
-            commentary.error('WARC 1.0 time may not have fractional seconds:', field, value)
-    else:
-        if '.' in value:
-            start, end = value.split('.', 1)
-            if not re.search(r'\A[0-9]{1,9}Z\Z', end):
-                commentary.error('fractional seconds must have 1-9 digits:', field, value)
+    ISO_RE = r'\A\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:.\d{1,9})?Z\Z'
 
-    # XXX the above is pretty incomplete for dash, colon, trailing Z, etc
+    if not re.match(ISO_RE, value):
+        commentary.error('Invalid timestamp:', field, value)
 
-    # TODO: "multiple records written as part of a single capture event shall use the same WARC-Date"
-    # how? follow WARC-Concurrent-To pointer(s) from request to response(s)
+    use_ms = False if version <= '1.0' else True
+    if not use_ms:
+        if '.' in value:
+            # specification infelicity: would be nice to have 'advice to implementers' here
+            commentary.error('WARC versions <= 1.0 may not have timestamps with fractional seconds:', field, value)
 
 
 def validate_content_length(field, value, record, version, commentary, pending):

From efe0fdab9178f440b4c6682482ca6110e73d020e Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Mon, 28 Jan 2019 16:46:59 -0800
Subject: [PATCH 22/68] buglet

---
 test/data/standard-torture-validate-field.warc | 1 +
 warcio/tester.py                               | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/data/standard-torture-validate-field.warc b/test/data/standard-torture-validate-field.warc
index 816413be..126ba964 100644
--- a/test/data/standard-torture-validate-field.warc
+++ b/test/data/standard-torture-validate-field.warc
@@ -6,6 +6,7 @@ WARC-Target-URI: h<>ttp://example.com/
 WARC-Type: does-not-exist
 WARC-Type: CAPITALIZED
 WARC-Concurrent-To: http://example.com/
+WARC-Concurrent-To: <uri:urn:asdf-asdf-asdf>
 WARC-Record-ID: <urn:uuid:torture-validate-field>
 WARC-Date: 2017-03-06T04:03:53Z
 WARC-Date: 2017-03-06T04:03:53.Z
diff --git a/warcio/tester.py b/warcio/tester.py
index 5396ff3b..8e9d8da3 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -608,7 +608,7 @@ def validate_record(record):
     seen_fields = set()
     for field, value in record.rec_headers.headers:
         field_l = field.lower()
-        if field != 'warc-concurrent-to' and field_l in seen_fields:
+        if field_l != 'warc-concurrent-to' and field_l in seen_fields:
             commentary.error('duplicate field seen:', field, value)
         seen_fields.add(field_l)
         if field_l not in warc_fields:

From 7a2664405e5e9246147bb23ecf6e6bdc813e0db3 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Tue, 29 Jan 2019 17:52:05 -0800
Subject: [PATCH 23/68] global checks

---
 .../data/standard-torture-validate-field.warc |   2 +
 test/test_tests.py                            |  51 +++-
 warcio/tester.py                              | 278 +++++++++++++++---
 3 files changed, 276 insertions(+), 55 deletions(-)

diff --git a/test/data/standard-torture-validate-field.warc b/test/data/standard-torture-validate-field.warc
index 126ba964..a928a4c4 100644
--- a/test/data/standard-torture-validate-field.warc
+++ b/test/data/standard-torture-validate-field.warc
@@ -33,6 +33,8 @@ WARC-Segment-Total-Length: 0
 WARC-Segment-Total-Length: not-an-integer
 WARC-Refers-To-Target-URI: http://example.com
 WARC-Refers-To-Date: not-a-date
+WARC-Refers-To-Filename: asdf
+WARC-Refers-To-File-Offset: 1234
 WARC-Unknown-Field: asdf
 Content-Length: 0
 
diff --git a/test/test_tests.py b/test/test_tests.py
index 89851eca..ebbdb509 100644
--- a/test/test_tests.py
+++ b/test/test_tests.py
@@ -6,6 +6,14 @@
 from .test_cli import patch_stdout
 
 
+file_map = {}
+
+
+def map_test_file(filename):
+    file_map[filename] = get_test_file(filename)
+    return file_map[filename]
+
+
 def helper(args, expected_exit_value):
     with patch_stdout() as buff:
         exit_value = None
@@ -22,17 +30,16 @@ def helper(args, expected_exit_value):
 def remove_before_test_data(s):
     ret = ''
     for line in s.splitlines(True):
-        if '/test/data/' in line:
-            line = 'test/data/' + line.split('/test/data/', 1)[1]
-        if '\\test\\data\\' in line:
-            line = 'test/data/' + line.split('\\test\\data\\', 1)[1]
+        for filename, value in file_map.items():
+            if value in line:
+                line = line.replace(value, 'test/data/' + filename)
         ret += line
     return ret
 
 
 def test_torture_validate_record():
     files = ['standard-torture-validate-record.warc']
-    files = [get_test_file(filename) for filename in files]
+    files = [map_test_file(filename) for filename in files]
 
     args = ['test']
     args.extend(files)
@@ -55,7 +62,7 @@ def test_torture_validate_record():
     WARC-Type warcinfo
     digest not present
     error: missing required header: WARC-Date
-    comment: warc-fields body present but empty
+    comment: warc-fields block present but empty
   WARC-Record-ID <uri:uuid:test-warcinfo-non-recommended-content-type>
     WARC-Type warcinfo
     digest not present
@@ -67,6 +74,7 @@ def test_torture_validate_record():
     error: missing required header: WARC-Date
     error: responses for http/https should have Content-Type of application/http; msgtype=response or application/http: text/plain
     error: WARC-IP-Address should be used for http and https responses
+    error: http/https responses should have http headers
   WARC-Record-ID <uri:uuid:test-resource-dns-content-type>
     WARC-Type resource
     digest not present
@@ -97,7 +105,7 @@ def test_torture_validate_record():
     WARC-Type metadata
     digest not present
     error: missing required header: WARC-Date
-    comment: warc-fields body present but empty
+    comment: warc-fields block present but empty
   WARC-Record-ID <uri:uuid:test-metadata-not-warc-fields>
     WARC-Type metadata
     digest not present
@@ -108,7 +116,7 @@ def test_torture_validate_record():
     error: missing required header: Content-Type
     error: missing required header: WARC-Date
     error: missing required header: WARC-Target-URI
-    comment: extension seen: WARC-Profile none
+    comment: unknown value, perhaps an extension: WARC-Profile none
     comment: no revisit details validation done due to unknown profile: none
   WARC-Record-ID <uri:uuid:test-revisit-profile-future>
     WARC-Type revisit
@@ -120,7 +128,7 @@ def test_torture_validate_record():
     recommendation: missing recommended header: WARC-Refers-To
     recommendation: missing recommended header: WARC-Refers-To-Date
     recommendation: missing recommended header: WARC-Refers-To-Target-URI
-    comment: extension seen: WARC-Profile http://netpreserve.org/warc/1.1/revisit/identical-payload-digest
+    comment: WARC-Profile value is for a different version: 1.0 http://netpreserve.org/warc/1.1/revisit/identical-payload-digest
   WARC-Record-ID <uri:uuid:test-revisit-profile-good>
     WARC-Type revisit
     digest not present
@@ -161,7 +169,7 @@ def test_torture_validate_record():
 
 def test_torture_validate_field():
     files = ['standard-torture-validate-field.warc']
-    files = [get_test_file(filename) for filename in files]
+    files = [map_test_file(filename) for filename in files]
 
     args = ['test']
     args.extend(files)
@@ -219,10 +227,12 @@ def test_torture_validate_field():
     comment: unknown WARC-Type: WARC-Type CAPITALIZED
     comment: unknown digest algorithm: WARC-Block-Digest asdf
     comment: Invalid-looking digest value: WARC-Block-Digest sha1:&$*^&*^#*&^
-    comment: extension seen: WARC-Truncated invalid
-    comment: extension seen: WARC-Profile asdf
+    comment: unknown value, perhaps an extension: WARC-Truncated invalid
+    comment: unknown value, perhaps an extension: WARC-Profile asdf
     comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com
     comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Date not-a-date
+    comment: This Heretrix extension never made it into the standard: WARC-Refers-To-Filename asdf
+    comment: This Heretrix extension never made it into the standard: WARC-Refers-To-File-Offset 1234
     comment: unknown field, no validation performed: WARC-Unknown-Field asdf
   WARC-Record-ID None
     WARC-Type invalid
@@ -240,6 +250,11 @@ def test_torture_validate_field():
     error: missing required header: WARC-Record-ID
     error: missing required header: WARC-Target-URI
     recommendation: do not segment WARC-Type request
+global warcinfo checks
+  comment: WARC-Warcinfo-ID not found: <urn:uuid:torture-validate-field> WARC-Warcinfo-ID asdf:asdf
+global Concurrent-To checks
+  comment: WARC-Concurrent-To not found: <urn:uuid:torture-validate-field> WARC-Concurrent-To <uri:urn:asdf-asdf-asdf>
+  comment: WARC-Concurrent-To not found: <urn:uuid:torture-validate-field> WARC-Concurrent-To http://example.com/
 """
 
     value = helper(args, 0)
@@ -251,7 +266,7 @@ def test_torture_validate_field():
 
 def test_arc():
     files = ['does-not-exist.arc']
-    files = [get_test_file(filename) for filename in files]
+    files = [map_test_file(filename) for filename in files]
 
     args = ['test']
     args.extend(files)
@@ -267,7 +282,7 @@ def test_arc():
 def test_digests():
     # needed for test coverage
     files = ['example-digest-bad.warc', 'example.warc']
-    files = [get_test_file(filename) for filename in files]
+    files = [map_test_file(filename) for filename in files]
 
     args = ['test']
     args.extend(files)
@@ -282,23 +297,28 @@ def test_digests():
     WARC-Type request
     digest pass
     error: WARC-IP-Address should be used for http and https requests
+    error: Duplicate WARC-Record-ID: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007> found in files test/data/example-digest-bad.warc test/data/example-digest-bad.warc
   WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
     WARC-Type request
     digest pass
     error: WARC-IP-Address should be used for http and https requests
+    error: Duplicate WARC-Record-ID: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007> found in files test/data/example-digest-bad.warc test/data/example-digest-bad.warc
   WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
     WARC-Type request
     digest pass
     error: WARC-IP-Address should be used for http and https requests
+    error: Duplicate WARC-Record-ID: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007> found in files test/data/example-digest-bad.warc test/data/example-digest-bad.warc
 test/data/example.warc
   WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
     WARC-Type request
     digest not present
     error: WARC-IP-Address should be used for http and https requests
+    error: Duplicate WARC-Record-ID: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007> found in files test/data/example.warc test/data/example-digest-bad.warc
   WARC-Record-ID <urn:uuid:e6e395ca-0221-11e7-a18d-0242ac120005>
     WARC-Type revisit
     digest present but not checked
     recommendation: missing recommended header: WARC-Refers-To
+    comment: This Heretrix extension never made it into the standard: WARC-Profile http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest
     comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com/
     comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Date 2017-03-06T04:02:06Z
   WARC-Record-ID <urn:uuid:e6e41fea-0221-11e7-8fe3-0242ac120007>
@@ -318,12 +338,11 @@ def test_leftovers():
     # hard to test because invalid WARC Content-Length raises in archiveiterator
     warcio.tester.validate_content_length('Content-Length', 'not-an-integer', None, '1.0', commentary, None)
 
-    # hard to test because warcio checks the WARC version
+    # hard to test because warcio raises for unknown WARC version
     warcio.tester.validate_profile('blah', 'blah', None, '999', commentary, None)
 
     expected = '''\
 error: must be an integer: Content-Length not-an-integer
-comment: no profile check because unknown warc version: blah blah
 '''
 
     assert '\n'.join(commentary.comments())+'\n' == expected
diff --git a/warcio/tester.py b/warcio/tester.py
index 8e9d8da3..870c7d6e 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -3,14 +3,15 @@
 import re
 import sys
 import six
+from collections import defaultdict
 
 from warcio.archiveiterator import WARCIterator
 from warcio.utils import to_native_str, Digester
 from warcio.exceptions import ArchiveLoadFailed
 
 
-class Commentary:
-    def __init__(self, record_id, rec_type):
+class Commentary(object):
+    def __init__(self, record_id=None, rec_type=None):
         self._record_id = record_id
         self._rec_type = rec_type
         self.errors = []
@@ -37,6 +38,7 @@ def has_comments(self):
             return True
 
     def comments(self):
+        # XXX str() all of these, in case an int or other thing slips in?
         for e in self.errors:
             yield 'error: ' + ' '.join(e)
         for r in self.recommendations:
@@ -55,6 +57,13 @@ def __getattr__(self, name):
             if self._content is None:
                 self._content = self.obj.content_stream().read()
             return self._content
+        if name == 'stream_for_digest_check':
+            def _doit():
+                while True:
+                    piece = self.obj.content_stream().read(1024*1024)
+                    if len(piece) == 0:
+                        break
+            return _doit
         return getattr(self.__dict__['obj'], name)
 
 
@@ -117,7 +126,7 @@ def validate_warc_fields(record, commentary):
         first_line = False
 
     if not lines:
-        commentary.comment('warc-fields body present but empty')
+        commentary.comment('warc-fields block present but empty')
         return
 
     # check known fields
@@ -126,6 +135,7 @@ def validate_warc_fields(record, commentary):
 def validate_warcinfo(record, commentary, pending):
     content_type = record.rec_headers.get_header('Content-Type', 'none')
     if content_type.lower() != 'application/warc-fields':
+        # https://github.com/iipc/warc-specifications/issues/33 -- SHALL BE or recommended?
         commentary.recommendation('warcinfo Content-Type recommended to be application/warc-fields:', content_type)
     else:
         #   format: warc-fields
@@ -137,8 +147,8 @@ def validate_warcinfo(record, commentary, pending):
         validate_warc_fields(record, commentary)
 
     # whole-file tests:
-    # optional that warcinfo be first in file, still deserves a comment
-    # allowable for warcinfo to appear anywhere
+    # recommended that all files start with warcinfo
+    # elsewise allowable for warcinfo to appear anywhere
 
 
 def validate_response(record, commentary, pending):
@@ -152,10 +162,32 @@ def validate_response(record, commentary, pending):
         if record.rec_headers.get_header('WARC-IP-Address') is None:
             commentary.error('WARC-IP-Address should be used for http and https responses')
 
-        # error: http and https schemes should have http response headers
-        #   test by attempting to parse them?
+        if not record.http_headers:
+            commentary.error('http/https responses should have http headers')
+            return
 
-        # comment: verify http content-length, if present -- commoncrawl nutch bug
+        http_content_length = record.http_headers.get_header('Content-Length')
+        if http_content_length is None:
+            return
+
+        if not http_content_length.isdigit():
+            commentary.comment('http content length header is not an integer', str(http_content_length))
+            return
+
+        # We want to verify http_content_length, which is the size of the compressed payload
+        # Trying to catch that commoncrawl nutch bug that prefixed /r/n to the payload without changing http content-length
+
+        # this blecherous hack is because we need the length of the (possibly compressed) raw stream
+        # without reading any of it (so that it can be read elsewhere to check the payload digest)
+
+        # XXX fix me before shipping :-D
+
+        if hasattr(record, 'raw_stream'):
+            if hasattr(record.raw_stream, 'stream'):
+                if hasattr(record.raw_stream.stream, 'limit'):
+                    if int(http_content_length) != record.raw_stream.stream.limit:
+                        commentary.comment('Actual http payload length is different from http header Content-Length:',
+                                           str(record.raw_stream.stream.limit), http_content_length)
 
 
 def validate_resource(record, commentary, pending):
@@ -171,6 +203,7 @@ def validate_resource(record, commentary, pending):
             pass
 
     # should never have http headers
+    #   heuristic of looking for an http status line? and then a blank line?!
 
 
 def validate_request(record, commentary, pending):
@@ -193,6 +226,8 @@ def validate_request(record, commentary, pending):
 def validate_metadata(record, commentary, pending):
     content_type = record.rec_headers.get_header('Content-Type', 'none')
     if content_type.lower() == 'application/warc-fields':
+        # https://github.com/iipc/warc-specifications/issues/33 SHALL be or not?
+        #
         # dublin core plus via, hopsFromSeed, fetchTimeMs -- w1.1 section 6
         # via: uri -- example in Warc 1.1 section 10.5 does not have <> around it
         # hopsFromSeed: string
@@ -206,8 +241,11 @@ def validate_revisit(record, commentary, pending):
     if warc_profile.endswith('/revisit/identical-payload-digest') or warc_profile.endswith('/revisit/uri-agnostic-identical-payload-digest'):
         config = {
             'required': ['WARC-Payload-Digest'],
-            'recommended': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date'],
+            'recommended': ['WARC-Refers-To'],
         }
+        if '/1.1/' in warc_profile:
+            config['recommended'].extend(('WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date'))
+
         validate_fields_against_rec_type('revisit', config, record.rec_headers, commentary, allow_all=True)
         # may have record block;
         #  if not, shall have Content-Length: 0,
@@ -282,7 +320,6 @@ def validate_bracketed_uri(field, value, record, version, commentary, pending):
 
 def validate_record_id(field, value, record, version, commentary, pending):
     validate_bracketed_uri(field, value, record, version, commentary, pending)
-    # TODO: should be "globally unique for its period of intended use"
 
 
 def validate_timestamp(field, value, record, version, commentary, pending):
@@ -328,8 +365,6 @@ def validate_content_type(field, value, record, version, commentary, pending):
     # at this point there can be multiple parameters,
     # some of which could have quoted string values with ; in them
 
-    # TODO: more checking
-
 
 def validate_digest(field, value, record, version, commentary, pending):
     if ':' not in value:
@@ -370,37 +405,45 @@ def validate_ip(field, value, record, version, commentary, pending):
 
 def validate_truncated(field, value, record, version, commentary, pending):
     if value.lower() not in {'length', 'time', 'disconnect', 'unspecified'}:
-        commentary.comment('extension seen:', field, value)
+        commentary.comment('unknown value, perhaps an extension:', field, value)
 
 
 def validate_warcinfo_id(field, value, record, version, commentary, pending):
     validate_bracketed_uri(field, value, record, version, commentary, pending)
-    # TODO: should point at a warcinfo record
 
 
 def validate_filename(field, value, record, version, commentary, pending):
-    # TODO: text or quoted-string
+    # text or quoted-string
+    # comment for dangerous utf-8 in filename?
     pass
 
 
 profiles = {
-    # XXX WARC/0.17 and WARC/0.18
+    '0.17': ['http://netpreserve.org/warc/0.17/revisit/identical-payload-digest',
+             'http://netpreserve.org/warc/0.17/revisit/server-not-modified'],
+    '0.18': ['http://netpreserve.org/warc/0.18/revisit/identical-payload-digest',
+             'http://netpreserve.org/warc/0.18/revisit/server-not-modified'],
     '1.0': ['http://netpreserve.org/warc/1.0/revisit/identical-payload-digest',
             'http://netpreserve.org/warc/1.0/revisit/server-not-modified',
-            # the following removed from iipc/webarchive-commons in may 2017; common in the wild TODO comment or not?
-            # https://github.com/iipc/webarchive-commons/commits/988bec707c27a01333becfc3bd502af4441ea1e1/src/main/java/org/archive/format/warc/WARCConstants.java
             'http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest'],
     '1.1': ['http://netpreserve.org/warc/1.1/revisit/identical-payload-digest',
             'http://netpreserve.org/warc/1.1/revisit/server-not-modified'],
 }
+profiles_rev = dict([(filename, version) for version, filenames in profiles.items() for filename in filenames])
 
 
 def validate_profile(field, value, record, version, commentary, pending):
     if version not in profiles:
-        commentary.comment('no profile check because unknown warc version:', field, value)
         return
-    if value not in profiles[version]:
-        commentary.comment('extension seen:', field, value)
+
+    if value in profiles_rev:
+        if profiles_rev[value] != version:
+            commentary.comment('WARC-Profile value is for a different version:', version, value)
+    else:
+        commentary.comment('unknown value, perhaps an extension:', field, value)
+
+    if '/revisit/uri-agnostic-identical-payload-digest' in value:
+        commentary.comment('This Heretrix extension never made it into the standard:', field, value)
 
 
 def validate_segment_number(field, value, record, version, commentary, pending):
@@ -427,6 +470,14 @@ def validate_segment_total_length(field, value, record, version, commentary, pen
         commentary.error('must be an integer:', field, value)
 
 
+def validate_refers_to_filename(field, value, record, version, commentary, pending):
+    commentary.comment('This Heretrix extension never made it into the standard:', field, value)
+
+
+def validate_refers_to_file_offset(field, value, record, version, commentary, pending):
+    commentary.comment('This Heretrix extension never made it into the standard:', field, value)
+
+
 warc_fields = {
     'WARC-Type': {
         'validate': validate_warc_type,
@@ -493,6 +544,12 @@ def validate_segment_total_length(field, value, record, version, commentary, pen
         'validate': validate_timestamp,
         'minver': '1.1',
     },
+    'WARC-Refers-To-Filename': {
+        'validate': validate_refers_to_filename,
+    },
+    'WARC-Refers-To-File-Offset': {
+        'validate': validate_refers_to_file_offset,
+    },
 }
 warc_fields = dict([(k.lower(), v) for k, v in warc_fields.items()])
 
@@ -579,13 +636,13 @@ def validate_fields_against_rec_type(rec_type, config, rec_headers, commentary,
     allowed = make_header_set(config, ('required', 'optional', 'recommended', 'ignored'))
     prohibited = make_header_set(config, ('prohibited',))
 
-    for field, value in rec_headers.headers:
+    for field, value in rec_headers.headers:  # XXX not exported
         fl = field.lower()
         if fl in prohibited:
             commentary.error('field not allowed in record type:', rec_type, field)
         elif allow_all or fl in allowed:
             pass
-        elif fl in warc_fields:  # pragma: no cover (this is a configuration error, if it happens)
+        elif fl in warc_fields:  # pragma: no cover (this is a tester.py configuration omission)
             commentary.comment('Known field, but not expected for this record type:', rec_type, field)
         else:
             # an 'unknown field' comment has already been issued in validate_record
@@ -598,15 +655,15 @@ def validate_record_against_rec_type(config, record, commentary, pending):
 
 
 def validate_record(record):
-    version = record.rec_headers.protocol.split('/', 1)[1]  # XXX not exported?
+    version = record.rec_headers.protocol.split('/', 1)[1]  # XXX not exported
 
     record_id = record.rec_headers.get_header('WARC-Record-ID')
     rec_type = record.rec_headers.get_header('WARC-Type')
-    commentary = Commentary(record_id, rec_type)
+    commentary = Commentary(record_id=record_id, rec_type=rec_type)
     pending = None
 
     seen_fields = set()
-    for field, value in record.rec_headers.headers:
+    for field, value in record.rec_headers.headers:  # XXX not exported
         field_l = field.lower()
         if field_l != 'warc-concurrent-to' and field_l in seen_fields:
             commentary.error('duplicate field seen:', field, value)
@@ -617,13 +674,13 @@ def validate_record(record):
         config = warc_fields[field_l]
         if 'minver' in config:
             if version < config['minver']:
-                # unknown fields are extensions, so this is a comment and not an error
                 commentary.comment('field was introduced after this warc version:', version, field, value)
         if 'validate' in config:
             config['validate'](field, value, record, version, commentary, pending)
 
     if rec_type not in record_types:
-        pass  # we print a comment for this elsewhere
+        # we print a comment for this elsewhere
+        pass
     else:
         validate_fields_against_rec_type(rec_type, record_types[rec_type], record.rec_headers, commentary)
         validate_record_against_rec_type(record_types[rec_type], record, commentary, pending)
@@ -631,10 +688,149 @@ def validate_record(record):
     return commentary
 
 
-def _process_one(warc):
-    if warc.endswith('.arc') or warc.endswith('.arc.gz'):
+def save_global_info(record, warcfile, commentary, all_records, concurrent_to):
+    record_id = record.rec_headers.get_header('WARC-Record-ID')
+    if record_id is None:
         return
-    with open(warc, 'rb') as stream:
+
+    for field, value in record.rec_headers.headers:  # XXX not exported
+        if field.lower() == 'warc-concurrent-to':
+            if record_id is not None and value is not None:
+                concurrent_to[record_id].append(value)
+                concurrent_to[value].append(record_id)
+
+    save = {'warcfile': warcfile}
+
+    saved_fields = (
+        'WARC-Type', 'WARC-Warcinfo-ID', 'WARC-Date'
+        'WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Payload-Digest', 'WARC-Target-URI',
+        'WARC-Segment-Number', 'WARC-Segment-Origin-ID', 'WARC-Segment-Total-Length', 'WARC-Truncated'
+    )
+    saved_fields = set([x.lower() for x in saved_fields])
+
+    for field, value in record.rec_headers.headers:  # XXX not exported
+        field_l = field.lower()
+        if field_l in saved_fields and value is not None:
+            save[field_l] = value
+        if field_l == 'warc-concurrent-to':
+            if 'warc-concurrent-to' not in save:
+                save['warc-concurrent-to'] = []
+            save['warc-concurrent-to'].append(value)
+
+    if record_id in all_records:
+        commentary.error('Duplicate WARC-Record-ID:', record_id, 'found in files', warcfile, all_records[record_id]['warcfile'])
+    else:
+        all_records[record_id] = save
+
+
+def check_global(all_records, concurrent_to):
+    check_global_warcinfo(all_records)
+    check_global_concurrent_to(all_records, concurrent_to)
+    check_global_refers_to(all_records)
+    check_global_segment(all_records)
+
+
+def _print_global(header, commentary):
+    if commentary.has_comments():
+        print(header)
+        for c in commentary.comments():
+            print(' ', c)
+
+
+def check_global_warcinfo(all_records):
+    commentary = Commentary()
+    for record_id, fields in all_records.items():
+        if 'warc-warcinfo-id' in fields:
+            wanted_id = fields['warc-warcinfo-id']
+            if wanted_id not in all_records or all_records[wanted_id]['warc-type'] != 'warcinfo':
+                commentary.comment('WARC-Warcinfo-ID not found:', record_id, 'WARC-Warcinfo-ID', wanted_id)
+
+    _print_global('global warcinfo checks', commentary)
+
+
+def check_global_concurrent_to(all_records, concurrent_to):
+    commentary = Commentary()
+    for record_id, fields in all_records.items():
+        if 'warc-concurrent-to' in fields:
+            whole_set = set(fields['warc-concurrent-to'])
+            del fields['warc-concurrent-to']
+            while True:
+                current_set = list(whole_set)
+                for c in current_set:
+                    if c in all_records and 'warc-concurrent-to' in all_records[c]:
+                        whole_set.update(set(all_records[c]['warc-concurrent-to']))
+                        del all_records[c]['warc-concurrent-to']
+                if len(whole_set) == len(current_set):
+                    break
+            warc_date = fields.get('warc-date')
+            for wanted_id in sorted(whole_set):
+                if wanted_id not in all_records:
+                    commentary.comment('WARC-Concurrent-To not found:', record_id, 'WARC-Concurrent-To', wanted_id)
+                else:
+                    new_date = all_records[wanted_id].get('warc-date')
+                    if warc_date != new_date:
+                        commentary.comment('WARC-Concurrent-To set has conflicting dates:',
+                                           record_id, warc_date, wanted_id, new_date)
+
+    _print_global('global Concurrent-To checks', commentary)
+
+
+def _revisit_compare(record_id, fields, source_field, wanted_id, all_records, target_field, commentary):
+    if source_field.lower() not in fields:
+        return
+
+    if target_field.lower() not in all_records[wanted_id]:
+        commentary.comment('revisit target lacks field:', wanted_id, target_field)
+        return
+
+    source_value = fields[source_field.lower()]
+    target_value = all_records[wanted_id][target_field.lower()]
+    if source_value != target_value:
+        commentary.comment('revisit and revisit target disagree:',
+                           record_id, source_field, source_value,
+                           wanted_id, target_field, target_value)
+
+
+def check_global_refers_to(all_records):
+    commentary = Commentary()
+    for record_id, fields in all_records.items():
+        if 'warc-refers-to' not in fields:
+            continue
+
+        wanted_id = fields['warc-refers-to']
+        if wanted_id not in all_records:
+            commentary.comment('WARC-Refers-To target not found:', record_id, 'Warc-Refers-To', wanted_id)
+            continue
+
+        rec_type = fields.get('warc-type')
+        if rec_type != 'revisit':
+            continue
+
+        _revisit_compare(record_id, fields, 'WARC-Refers-To-Target-URI',
+                         wanted_id, all_records, 'WARC-Target-URI', commentary)
+        _revisit_compare(record_id, fields, 'WARC-Refers-To-Date',
+                         wanted_id, all_records, 'WARC-Date', commentary)
+        _revisit_compare(record_id, fields, 'WARC-Payload-Digest',
+                         wanted_id, all_records, 'WARC-Payload-Digest', commentary)
+
+    _print_global('global Refers-To checks', commentary)
+
+
+def check_global_segment(all_records):
+    # warc-segment-origin-id :: exists, is warc-segment-number 1
+    #   all segments exist, and the last one has WARC-Segment-Total-Length
+    #   and only the last one has WARC-Truncated, if any
+
+    # Segmentation shall not be used if a record can be stored in an existing warc file
+    # The origin segment shall be placed in a new warc file preceded only by a warcinfo record (if any)
+
+    pass
+
+
+def _process_one(warcfile, all_records, concurrent_to):
+    if warcfile.endswith('.arc') or warcfile.endswith('.arc.gz'):
+        return
+    with open(warcfile, 'rb') as stream:
         for record in WARCIterator(stream, check_digests=True, fixup_bugs=False):
 
             record = WrapRecord(record)
@@ -642,10 +838,9 @@ def _process_one(warc):
                               record.rec_headers.get_header('WARC-Block-Digest'))
 
             commentary = validate_record(record)
+            save_global_info(record, warcfile, commentary, all_records, concurrent_to)
 
-            record.content  # make sure digests are checked
-            # XXX might need to read and digest the raw stream to check digests for chunked encoding?
-            # XXX chunked lacks Content-Length and presumably the digest needs to be computed on the non-chunked bytes
+            record.stream_for_digest_check()
 
             if commentary.has_comments() or record.digest_checker.passed is False:
                 print(' ', 'WARC-Record-ID', commentary.record_id())
@@ -671,16 +866,21 @@ class Tester(object):
     def __init__(self, cmd):
         self.inputs = cmd.inputs
         self.exit_value = 0
+        self.all_records = defaultdict(dict)
+        self.concurrent_to = defaultdict(list)
 
     def process_all(self):
-        for warc in self.inputs:
-            print(warc)
+        for warcfile in self.inputs:
+            print(warcfile)
             try:
-                self.process_one(warc)
+                self.process_one(warcfile)
             except ArchiveLoadFailed as e:
                 print('  saw exception ArchiveLoadFailed: '+str(e).rstrip(), file=sys.stderr)
                 print('  skipping rest of file', file=sys.stderr)
+
+        check_global(self.all_records, self.concurrent_to)
+
         return self.exit_value
 
-    def process_one(self, filename):
-        _process_one(filename)
+    def process_one(self, warcfile):
+        _process_one(warcfile, self.all_records, self.concurrent_to)

From 1d6fd9d070e76e800be6821dfa5f9ec9881f3d7b Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Thu, 31 Jan 2019 12:03:44 -0800
Subject: [PATCH 24/68] check -v; capitalize most commentary

---
 warcio/cli.py    |  1 +
 warcio/tester.py | 89 +++++++++++++++++++++++++-----------------------
 2 files changed, 48 insertions(+), 42 deletions(-)

diff --git a/warcio/cli.py b/warcio/cli.py
index 7e40cdad..88f3445a 100644
--- a/warcio/cli.py
+++ b/warcio/cli.py
@@ -57,6 +57,7 @@ def main(args=None):
 
     test = subparsers.add_parser('test', help='WARC standards tester')
     test.add_argument('inputs', nargs='+')
+    test.add_argument('-v', '--verbose', action='store_true')
     test.set_defaults(func=tester)
 
     cmd = parser.parse_args(args=args)
diff --git a/warcio/tester.py b/warcio/tester.py
index 870c7d6e..9605ea7b 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -157,7 +157,7 @@ def validate_response(record, commentary, pending):
     if target_uri.startswith('http:') or target_uri.startswith('https:'):
         content_type = record.rec_headers.get_header('Content-Type', 'none')
         if canon_content_type(content_type) not in {'application/http; msgtype=response', 'application/http'}:
-            commentary.error('responses for http/https should have Content-Type of application/http; msgtype=response or application/http:', content_type)
+            commentary.error('Responses for http/https should have Content-Type of application/http; msgtype=response or application/http:', content_type)
 
         if record.rec_headers.get_header('WARC-IP-Address') is None:
             commentary.error('WARC-IP-Address should be used for http and https responses')
@@ -264,7 +264,7 @@ def validate_revisit(record, commentary, pending):
         #     if yes, should be like a response record, truncated if desired
         #   WARC-Refers-To-Date should be the same as WARC-Date in the original record if present
     else:
-        commentary.comment('no revisit details validation done due to unknown profile:', warc_profile)
+        commentary.comment('No revisit details validation done due to unknown profile:', warc_profile)
 
 
 def validate_conversion(record, commentary, pending):
@@ -291,14 +291,17 @@ def validate_unbracketed_uri(field, value, record, version, commentary, pending)
     if value.startswith('<') or value.endswith('>'):
         # wget 1.19 bug caused by WARC 1.0 spec error
         commentary.error('uri must not be within <>:', field, value)
+        value = value[1:-1]
+
+    scheme = value.split(':', 1)[0]
     if ':' not in value:
-        commentary.error('invalid uri, no scheme:', field, value)
+        commentary.error('Invalid uri, no scheme:', field, value)
+    elif not re.search(r'\A[A-Za-z][A-Za-z0-9+\-\.]*\Z', scheme):
+        commentary.error('Invalid uri scheme, bad character:', field, value)
+        # use https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml ??
+
     if re.search(r'\s', value):
-        commentary.error('invalid uri, contains whitespace:', field, value)
-    scheme = value.split(':', 1)[0]
-    if not re.search(r'\A[A-Za-z][A-Za-z0-9+\-\.]*\Z', scheme):
-        commentary.error('invalid uri scheme, bad character:', field, value)
-    # https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
+        commentary.error('Invalid uri, contains whitespace:', field, value)
 
 
 def validate_warc_type(field, value, record, version, commentary, pending):
@@ -307,7 +310,7 @@ def validate_warc_type(field, value, record, version, commentary, pending):
         commentary.comment('WARC-Type is not lower-case:', field, value)
     if value.lower() not in record_types:
         # standard says readers should ignore unknown warc-types
-        commentary.comment('unknown WARC-Type:', field, value)
+        commentary.comment('Unknown WARC-Type:', field, value)
 
 
 def validate_bracketed_uri(field, value, record, version, commentary, pending):
@@ -337,7 +340,7 @@ def validate_timestamp(field, value, record, version, commentary, pending):
 
 def validate_content_length(field, value, record, version, commentary, pending):
     if not value.isdigit():
-        commentary.error('must be an integer:', field, value)
+        commentary.error('Must be an integer:', field, value)
 
 
 token_re = r'\A[!"#$%&\'()*+\-\.0-9A-Z\^_`a-z|~]+\Z'
@@ -346,7 +349,7 @@ def validate_content_length(field, value, record, version, commentary, pending):
 
 def validate_content_type(field, value, record, version, commentary, pending):
     if '/' not in value:
-        commentary.error('must contain a /:', field, value)
+        commentary.error('Must contain a /:', field, value)
     splits = value.split('/', 1)
     ctype = splits[0]
     if len(splits) > 1:
@@ -354,13 +357,13 @@ def validate_content_type(field, value, record, version, commentary, pending):
     else:
         rest = ''
     if not re.search(token_re, ctype):
-        commentary.error('invalid type:', field, value)
+        commentary.error('Invalid type:', field, value)
     if ';' in rest:
         subtype, rest = rest.split(';', 1)
     else:
         subtype = rest
     if not re.search(token_re, subtype):
-        commentary.error('invalid subtype:', field, value)
+        commentary.error('Invalid subtype:', field, value)
 
     # at this point there can be multiple parameters,
     # some of which could have quoted string values with ; in them
@@ -368,7 +371,7 @@ def validate_content_type(field, value, record, version, commentary, pending):
 
 def validate_digest(field, value, record, version, commentary, pending):
     if ':' not in value:
-        commentary.error('missing algorithm:', field, value)
+        commentary.error('Missing algorithm:', field, value)
     splits = value.split(':', 1)
     algorithm = splits[0]
     if len(splits) > 1:
@@ -376,12 +379,12 @@ def validate_digest(field, value, record, version, commentary, pending):
     else:
         digest = 'none'
     if not re.search(token_re, algorithm):
-        commentary.error('invalid algorithm:', field, value)
+        commentary.error('Invalid algorithm:', field, value)
     else:
         try:
             Digester(algorithm)
         except ValueError:
-            commentary.comment('unknown digest algorithm:', field, value)
+            commentary.comment('Unknown digest algorithm:', field, value)
     if not re.search(token_re, digest):
         # https://github.com/iipc/warc-specifications/issues/48
         # commentary.comment('spec incorrectly says this is an invalid digest', field, value)
@@ -398,14 +401,14 @@ def validate_ip(field, value, record, version, commentary, pending):
             value = unicode(value)
         ipaddress.ip_address(value)
     except ValueError:
-        commentary.error('invalid ip:', field, value)
+        commentary.error('Invalid ip:', field, value)
     except (ImportError, NameError):  # pragma: no cover
-        commentary.comment('did not check ip address format, install ipaddress module from pypi if you care')
+        commentary.comment('Did not check ip address format, install ipaddress module from pypi if you care')
 
 
 def validate_truncated(field, value, record, version, commentary, pending):
     if value.lower() not in {'length', 'time', 'disconnect', 'unspecified'}:
-        commentary.comment('unknown value, perhaps an extension:', field, value)
+        commentary.comment('Unknown value, perhaps an extension:', field, value)
 
 
 def validate_warcinfo_id(field, value, record, version, commentary, pending):
@@ -440,7 +443,7 @@ def validate_profile(field, value, record, version, commentary, pending):
         if profiles_rev[value] != version:
             commentary.comment('WARC-Profile value is for a different version:', version, value)
     else:
-        commentary.comment('unknown value, perhaps an extension:', field, value)
+        commentary.comment('Unknown value, perhaps an extension:', field, value)
 
     if '/revisit/uri-agnostic-identical-payload-digest' in value:
         commentary.comment('This Heretrix extension never made it into the standard:', field, value)
@@ -448,26 +451,26 @@ def validate_profile(field, value, record, version, commentary, pending):
 
 def validate_segment_number(field, value, record, version, commentary, pending):
     if not value.isdigit():
-        commentary.error('must be an integer:', field, value)
+        commentary.error('Must be an integer:', field, value)
         return
     iv = int(value)
     if iv == 0:
-        commentary.error('must be 1 or greater:', field, value)
+        commentary.error('Must be 1 or greater:', field, value)
 
     rec_type = record.rec_headers.get_header('WARC-Type', 'none')
     if rec_type != 'continuation':
         if iv != 1:
-            commentary.error('non-continuation records must always have WARC-Segment-Number: 1:', field, value)
+            commentary.error('Non-continuation records must always have WARC-Segment-Number: 1:', field, value)
         origin_id = record.rec_headers.get_header('WARC-Segment-Origin-ID')
         if origin_id is None:
-            commentary.error('segmented records must have both WARC-Segment-Number and WARC-Segment-Origin-ID')
+            commentary.error('Segmented records must have both WARC-Segment-Number and WARC-Segment-Origin-ID')
     if rec_type in {'warcinfo', 'request', 'metadata', 'revisit'}:
-        commentary.recommendation('do not segment WARC-Type', rec_type)
+        commentary.recommendation('Do not segment WARC-Type', rec_type)
 
 
 def validate_segment_total_length(field, value, record, version, commentary, pending):
     if not value.isdigit():
-        commentary.error('must be an integer:', field, value)
+        commentary.error('Must be an integer:', field, value)
 
 
 def validate_refers_to_filename(field, value, record, version, commentary, pending):
@@ -525,6 +528,7 @@ def validate_refers_to_file_offset(field, value, record, version, commentary, pe
         'validate': validate_profile,
     },
     'WARC-Identified-Payload-Type': {
+        # see also https://github.com/iipc/warc-specifications/issues/49 -- odd that it's allowed for request, revisit, continuation
         'validate': validate_content_type,
     },
     'WARC-Segment-Origin-ID': {
@@ -565,7 +569,7 @@ def validate_refers_to_file_offset(field, value, record, version, commentary, pe
         'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type',
                      'Content-Type', 'WARC-Target-URI'],
         'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated',
-                     'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Segment-Number', 'WARC-Segment-Origin-ID'],
+                     'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Identified-Payload-Type', 'WARC-Segment-Number', 'WARC-Segment-Origin-ID'],
         'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
         'validate': validate_response,
     },
@@ -605,7 +609,7 @@ def validate_refers_to_file_offset(field, value, record, version, commentary, pe
     },
     'conversion': {
         'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'WARC-Target-URI'],
-        'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Truncated', 'WARC-Refers-To', 'WARC-Warcinfo-ID', 'WARC-Segment-Number', 'WARC-Segment-Origin-ID'],
+        'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Truncated', 'WARC-Refers-To', 'WARC-Warcinfo-ID', 'WARC-Identified-Payload-Type', 'WARC-Segment-Number', 'WARC-Segment-Origin-ID'],
         'prohibited': ['WARC-Concurrent-To', 'WARC-IP-Address', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
         'validate': validate_conversion,
     },
@@ -613,7 +617,7 @@ def validate_refers_to_file_offset(field, value, record, version, commentary, pe
         'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type',
                      'WARC-Segment-Number', 'WARC-Segment-Origin-ID', 'WARC-Target-URI'],
         'optional': ['WARC-Segment-Total-Length', 'WARC-Truncated'],
-        'prohibited': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
+        'prohibited': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile', 'WARC-Identified-Payload-Type'],
         'validate': validate_continuation,
     },
 }
@@ -629,17 +633,17 @@ def make_header_set(config, kinds):
 def validate_fields_against_rec_type(rec_type, config, rec_headers, commentary, allow_all=False):
     for req in sorted(config.get('required', [])):
         if not rec_headers.get_header(req):
-            commentary.error('missing required header:', req)
+            commentary.error('Missing required header:', req)
     for rec in sorted(config.get('recommended', [])):
         if not rec_headers.get_header(rec):
-            commentary.recommendation('missing recommended header:', rec)
+            commentary.recommendation('Missing recommended header:', rec)
     allowed = make_header_set(config, ('required', 'optional', 'recommended', 'ignored'))
     prohibited = make_header_set(config, ('prohibited',))
 
     for field, value in rec_headers.headers:  # XXX not exported
         fl = field.lower()
         if fl in prohibited:
-            commentary.error('field not allowed in record type:', rec_type, field)
+            commentary.error('Field not allowed in record type:', rec_type, field)
         elif allow_all or fl in allowed:
             pass
         elif fl in warc_fields:  # pragma: no cover (this is a tester.py configuration omission)
@@ -666,15 +670,15 @@ def validate_record(record):
     for field, value in record.rec_headers.headers:  # XXX not exported
         field_l = field.lower()
         if field_l != 'warc-concurrent-to' and field_l in seen_fields:
-            commentary.error('duplicate field seen:', field, value)
+            commentary.error('Duplicate field seen:', field, value)
         seen_fields.add(field_l)
         if field_l not in warc_fields:
-            commentary.comment('unknown field, no validation performed:', field, value)
+            commentary.comment('Unknown field, no validation performed:', field, value)
             continue
         config = warc_fields[field_l]
         if 'minver' in config:
             if version < config['minver']:
-                commentary.comment('field was introduced after this warc version:', version, field, value)
+                commentary.comment('Field was introduced after this warc version:', version, field, value)
         if 'validate' in config:
             config['validate'](field, value, record, version, commentary, pending)
 
@@ -780,13 +784,13 @@ def _revisit_compare(record_id, fields, source_field, wanted_id, all_records, ta
         return
 
     if target_field.lower() not in all_records[wanted_id]:
-        commentary.comment('revisit target lacks field:', wanted_id, target_field)
+        commentary.comment('Revisit target lacks field:', wanted_id, target_field)
         return
 
     source_value = fields[source_field.lower()]
     target_value = all_records[wanted_id][target_field.lower()]
     if source_value != target_value:
-        commentary.comment('revisit and revisit target disagree:',
+        commentary.comment('Revisit and revisit target disagree:',
                            record_id, source_field, source_value,
                            wanted_id, target_field, target_value)
 
@@ -827,7 +831,7 @@ def check_global_segment(all_records):
     pass
 
 
-def _process_one(warcfile, all_records, concurrent_to):
+def _process_one(warcfile, all_records, concurrent_to, verbose):
     if warcfile.endswith('.arc') or warcfile.endswith('.arc.gz'):
         return
     with open(warcfile, 'rb') as stream:
@@ -842,7 +846,7 @@ def _process_one(warcfile, all_records, concurrent_to):
 
             record.stream_for_digest_check()
 
-            if commentary.has_comments() or record.digest_checker.passed is False:
+            if verbose or commentary.has_comments() or record.digest_checker.passed is False:
                 print(' ', 'WARC-Record-ID', commentary.record_id())
                 print('   ', 'WARC-Type', commentary.rec_type())
 
@@ -865,6 +869,7 @@ def _process_one(warcfile, all_records, concurrent_to):
 class Tester(object):
     def __init__(self, cmd):
         self.inputs = cmd.inputs
+        self.verbose = cmd.verbose
         self.exit_value = 0
         self.all_records = defaultdict(dict)
         self.concurrent_to = defaultdict(list)
@@ -875,12 +880,12 @@ def process_all(self):
             try:
                 self.process_one(warcfile)
             except ArchiveLoadFailed as e:
-                print('  saw exception ArchiveLoadFailed: '+str(e).rstrip(), file=sys.stderr)
-                print('  skipping rest of file', file=sys.stderr)
+                print('  saw exception ArchiveLoadFailed: '+str(e).rstrip())
+                print('  skipping rest of file')
 
         check_global(self.all_records, self.concurrent_to)
 
         return self.exit_value
 
     def process_one(self, warcfile):
-        _process_one(warcfile, self.all_records, self.concurrent_to)
+        _process_one(warcfile, self.all_records, self.concurrent_to, self.verbose)

From 5b716b7fbf7fc26b5d11171c97f3e2f98f9f100f Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Thu, 31 Jan 2019 21:49:41 -0800
Subject: [PATCH 25/68] ...

---
 test/test_tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_tests.py b/test/test_tests.py
index ebbdb509..9c3c9fec 100644
--- a/test/test_tests.py
+++ b/test/test_tests.py
@@ -342,7 +342,7 @@ def test_leftovers():
     warcio.tester.validate_profile('blah', 'blah', None, '999', commentary, None)
 
     expected = '''\
-error: must be an integer: Content-Length not-an-integer
+error: Must be an integer: Content-Length not-an-integer
 '''
 
     assert '\n'.join(commentary.comments())+'\n' == expected

From fb8e3faa9556df79b26024baedb5533d2089c314 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Fri, 1 Feb 2019 10:25:40 -0800
Subject: [PATCH 26/68] revisits and global detection with just one file

---
 warcio/tester.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/warcio/tester.py b/warcio/tester.py
index 9605ea7b..68f108b2 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -722,7 +722,10 @@ def save_global_info(record, warcfile, commentary, all_records, concurrent_to):
             save['warc-concurrent-to'].append(value)
 
     if record_id in all_records:
-        commentary.error('Duplicate WARC-Record-ID:', record_id, 'found in files', warcfile, all_records[record_id]['warcfile'])
+        if warcfile != all_records[record_id]['warcfile']:
+            commentary.error('Duplicate WARC-Record-ID:', record_id, 'found in files', warcfile, all_records[record_id]['warcfile'])
+        else:
+            commentary.error('Duplicate WARC-Record-ID:', record_id)
     else:
         all_records[record_id] = save
 
@@ -853,9 +856,12 @@ def _process_one(warcfile, all_records, concurrent_to, verbose):
                 if record.digest_checker.passed is True:
                     print('    digest pass')
                 elif record.digest_checker.passed is None:
-                    if digest_present:  # pragma: no cover
-                        # WARC record missing Content-Length: header, which is verboten
-                        print('    digest present but not checked')
+                    if digest_present:
+                        if commentary.rec_type() == 'revisit':
+                            print('    digest present but not checked (revisit)')
+                        else:  # pragma: no cover
+                            # WARC record missing Content-Length: header, which is verboten
+                            print('    digest present but not checked')
                     else:
                         print('    digest not present')
                 for p in record.digest_checker.problems:

From d2436323fb568fb8729e2d7d3404ebbe8ce29b60 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Fri, 1 Feb 2019 15:47:01 -0800
Subject: [PATCH 27/68] show errors for decompression and unchunking failures

---
 test/test_tests.py        | 14 +++++++-------
 warcio/archiveiterator.py |  5 +++--
 warcio/bufferedreaders.py | 17 ++++++++++++++---
 warcio/recordloader.py    | 10 ++++++----
 warcio/tester.py          | 12 ++++++++++--
 5 files changed, 40 insertions(+), 18 deletions(-)

diff --git a/test/test_tests.py b/test/test_tests.py
index 9c3c9fec..200df8ae 100644
--- a/test/test_tests.py
+++ b/test/test_tests.py
@@ -297,17 +297,17 @@ def test_digests():
     WARC-Type request
     digest pass
     error: WARC-IP-Address should be used for http and https requests
-    error: Duplicate WARC-Record-ID: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007> found in files test/data/example-digest-bad.warc test/data/example-digest-bad.warc
+    error: Duplicate WARC-Record-ID: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
   WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
     WARC-Type request
     digest pass
     error: WARC-IP-Address should be used for http and https requests
-    error: Duplicate WARC-Record-ID: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007> found in files test/data/example-digest-bad.warc test/data/example-digest-bad.warc
+    error: Duplicate WARC-Record-ID: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
   WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
     WARC-Type request
     digest pass
     error: WARC-IP-Address should be used for http and https requests
-    error: Duplicate WARC-Record-ID: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007> found in files test/data/example-digest-bad.warc test/data/example-digest-bad.warc
+    error: Duplicate WARC-Record-ID: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
 test/data/example.warc
   WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
     WARC-Type request
@@ -316,11 +316,11 @@ def test_digests():
     error: Duplicate WARC-Record-ID: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007> found in files test/data/example.warc test/data/example-digest-bad.warc
   WARC-Record-ID <urn:uuid:e6e395ca-0221-11e7-a18d-0242ac120005>
     WARC-Type revisit
-    digest present but not checked
-    recommendation: missing recommended header: WARC-Refers-To
+    digest present but not checked (revisit)
+    recommendation: Missing recommended header: WARC-Refers-To
     comment: This Heretrix extension never made it into the standard: WARC-Profile http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest
-    comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com/
-    comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Date 2017-03-06T04:02:06Z
+    comment: Field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com/
+    comment: Field was introduced after this warc version: 1.0 WARC-Refers-To-Date 2017-03-06T04:02:06Z
   WARC-Record-ID <urn:uuid:e6e41fea-0221-11e7-8fe3-0242ac120007>
     WARC-Type request
     digest not present
diff --git a/warcio/archiveiterator.py b/warcio/archiveiterator.py
index 0d1fe2dd..176acb1c 100644
--- a/warcio/archiveiterator.py
+++ b/warcio/archiveiterator.py
@@ -43,13 +43,14 @@ class ArchiveIterator(six.Iterator):
     def __init__(self, fileobj, no_record_parse=False,
                  verify_http=False, arc2warc=False,
                  ensure_http_headers=False, block_size=BUFF_SIZE,
-                 check_digests=False, fixup_bugs=True):
+                 check_digests=False, fixup_bugs=True, raise_exceptions=False):
 
         self.fh = fileobj
 
         self.loader = ArcWarcRecordLoader(verify_http=verify_http,
                                           arc2warc=arc2warc,
-                                          fixup_bugs=fixup_bugs)
+                                          fixup_bugs=fixup_bugs,
+                                          raise_exceptions=raise_exceptions)
         self.known_format = None
 
         self.mixed_arc_warc = arc2warc
diff --git a/warcio/bufferedreaders.py b/warcio/bufferedreaders.py
index 5b11522b..4ad52f6d 100644
--- a/warcio/bufferedreaders.py
+++ b/warcio/bufferedreaders.py
@@ -36,6 +36,13 @@ def brotli_decompressor():
         pass
 
 
+#=================================================================
+class DecompressionException(Exception):
+    def __init__(self, msg, data=b''):
+        Exception.__init__(self, msg)
+        self.data = data
+
+
 #=================================================================
 class BufferedReader(object):
     """
@@ -64,7 +71,8 @@ class BufferedReader(object):
     def __init__(self, stream, block_size=BUFF_SIZE,
                  decomp_type=None,
                  starting_data=None,
-                 read_all_members=False):
+                 read_all_members=False,
+                 raise_exceptions=False):
 
         self.stream = stream
         self.block_size = block_size
@@ -77,6 +85,7 @@ def __init__(self, stream, block_size=BUFF_SIZE,
         self.buff_size = 0
 
         self.read_all_members = read_all_members
+        self.raise_exceptions = raise_exceptions
 
     def set_decomp(self, decomp_type):
         self._init_decomp(decomp_type)
@@ -142,6 +151,8 @@ def _decompress(self, data):
                         self._init_decomp('deflate_alt')
                         data = self._decompress(data)
                     else:
+                        if self.raise_exceptions:
+                            raise DecompressionException(str(e))
                         self.decompressor = None
                 # otherwise (partly decompressed), something is wrong
                 else:
@@ -280,13 +291,13 @@ class ChunkedDataReader(BufferedReader):
     If at any point the chunked header is not available, the stream is
     assumed to not be chunked and no more dechunking occurs.
     """
-    def __init__(self, stream, raise_exceptions=False, **kwargs):
+    def __init__(self, stream, **kwargs):
         super(ChunkedDataReader, self).__init__(stream, **kwargs)
         self.all_chunks_read = False
         self.not_chunked = False
 
         # if False, we'll use best-guess fallback for parse errors
-        self.raise_chunked_data_exceptions = raise_exceptions
+        self.raise_chunked_data_exceptions = kwargs.get('raise_exceptions')
 
     def _fillbuff(self, block_size=None):
         if self.not_chunked:
diff --git a/warcio/recordloader.py b/warcio/recordloader.py
index 1f17d1f0..d00e8642 100644
--- a/warcio/recordloader.py
+++ b/warcio/recordloader.py
@@ -20,6 +20,7 @@ def __init__(self, *args, **kwargs):
          self.http_headers, self.content_type, self.length) = args
         self.payload_length = -1
         self.digest_checker = kwargs.get('digest_checker')
+        self.raise_exceptions = kwargs.get('raise_exceptions')
 
     def content_stream(self):
         if not self.http_headers:
@@ -34,9 +35,9 @@ def content_stream(self):
                 encoding = None
 
         if self.http_headers.get_header('transfer-encoding') == 'chunked':
-            return ChunkedDataReader(self.raw_stream, decomp_type=encoding)
+            return ChunkedDataReader(self.raw_stream, decomp_type=encoding, raise_exceptions=self.raise_exceptions)
         elif encoding:
-            return BufferedReader(self.raw_stream, decomp_type=encoding)
+            return BufferedReader(self.raw_stream, decomp_type=encoding, raise_exceptions=self.raise_exceptions)
         else:
             return self.raw_stream
 
@@ -55,7 +56,7 @@ class ArcWarcRecordLoader(object):
     NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:')
     HTTP_SCHEMES = ('http:', 'https:')
 
-    def __init__(self, verify_http=True, arc2warc=True, fixup_bugs=True):
+    def __init__(self, verify_http=True, arc2warc=True, fixup_bugs=True, raise_exceptions=False):
         if arc2warc:
             self.arc_parser = ARC2WARCHeadersParser()
         else:
@@ -66,6 +67,7 @@ def __init__(self, verify_http=True, arc2warc=True, fixup_bugs=True):
 
         self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http)
         self.fixup_bugs = fixup_bugs
+        self.raise_exceptions = raise_exceptions
 
     def parse_record_stream(self, stream,
                             statusline=None,
@@ -147,7 +149,7 @@ def parse_record_stream(self, stream,
 
         return ArcWarcRecord(the_format, rec_type,
                              rec_headers, stream, http_headers,
-                             content_type, length, digest_checker=digest_checker)
+                             content_type, length, digest_checker=digest_checker, raise_exceptions=self.raise_exceptions)
 
     def wrap_digest_verifying_stream(self, stream, rec_type, rec_headers, digest_checker, length=None):
         payload_digest = rec_headers.get_header('WARC-Payload-Digest')
diff --git a/warcio/tester.py b/warcio/tester.py
index 68f108b2..84167c4c 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -8,6 +8,7 @@
 from warcio.archiveiterator import WARCIterator
 from warcio.utils import to_native_str, Digester
 from warcio.exceptions import ArchiveLoadFailed
+from warcio.bufferedreaders import ChunkedDataException, DecompressionException
 
 
 class Commentary(object):
@@ -838,7 +839,7 @@ def _process_one(warcfile, all_records, concurrent_to, verbose):
     if warcfile.endswith('.arc') or warcfile.endswith('.arc.gz'):
         return
     with open(warcfile, 'rb') as stream:
-        for record in WARCIterator(stream, check_digests=True, fixup_bugs=False):
+        for record in WARCIterator(stream, check_digests=True, fixup_bugs=False, raise_exceptions=True):
 
             record = WrapRecord(record)
             digest_present = (record.rec_headers.get_header('WARC-Payload-Digest') or
@@ -847,7 +848,14 @@ def _process_one(warcfile, all_records, concurrent_to, verbose):
             commentary = validate_record(record)
             save_global_info(record, warcfile, commentary, all_records, concurrent_to)
 
-            record.stream_for_digest_check()
+            try:
+                record.stream_for_digest_check()
+            except ChunkedDataException:
+                commentary.error('Transfer-Encoding: chunked, saw an error attempting to unchunk')
+                pass
+            except DecompressionException as e:
+                commentary.error('Content-Encoding indicates compression, saw an error attempting to decompress: '+str(e))
+                pass
 
             if verbose or commentary.has_comments() or record.digest_checker.passed is False:
                 print(' ', 'WARC-Record-ID', commentary.record_id())

From 29517c41697e011a37e210a93f1e464f3a934bc6 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Fri, 1 Feb 2019 22:13:07 -0800
Subject: [PATCH 28/68] make this function reentrant

---
 warcio/recordloader.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/warcio/recordloader.py b/warcio/recordloader.py
index d00e8642..f8a47db4 100644
--- a/warcio/recordloader.py
+++ b/warcio/recordloader.py
@@ -21,11 +21,15 @@ def __init__(self, *args, **kwargs):
         self.payload_length = -1
         self.digest_checker = kwargs.get('digest_checker')
         self.raise_exceptions = kwargs.get('raise_exceptions')
+        self._content_stream = None
 
     def content_stream(self):
         if not self.http_headers:
             return self.raw_stream
 
+        if self._content_stream:
+            return self._content_stream
+
         encoding = self.http_headers.get_header('content-encoding')
 
         if encoding:
@@ -35,11 +39,13 @@ def content_stream(self):
                 encoding = None
 
         if self.http_headers.get_header('transfer-encoding') == 'chunked':
-            return ChunkedDataReader(self.raw_stream, decomp_type=encoding, raise_exceptions=self.raise_exceptions)
+            self._content_stream = ChunkedDataReader(self.raw_stream, decomp_type=encoding, raise_exceptions=self.raise_exceptions)
         elif encoding:
-            return BufferedReader(self.raw_stream, decomp_type=encoding, raise_exceptions=self.raise_exceptions)
+            self._content_stream = BufferedReader(self.raw_stream, decomp_type=encoding, raise_exceptions=self.raise_exceptions)
         else:
-            return self.raw_stream
+            self._content_stream = self.raw_stream
+
+        return self._content_stream
 
 
 #=================================================================

From 844807e63d6a0a98b75080e63b8a5192b764aecc Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Fri, 1 Feb 2019 22:13:26 -0800
Subject: [PATCH 29/68] narrow exception; fix bug not reading to the end of a
 chunked buffer

---
 warcio/bufferedreaders.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/warcio/bufferedreaders.py b/warcio/bufferedreaders.py
index 4ad52f6d..97325b7d 100644
--- a/warcio/bufferedreaders.py
+++ b/warcio/bufferedreaders.py
@@ -38,9 +38,8 @@ def brotli_decompressor():
 
 #=================================================================
 class DecompressionException(Exception):
-    def __init__(self, msg, data=b''):
+    def __init__(self, msg):
         Exception.__init__(self, msg)
-        self.data = data
 
 
 #=================================================================
@@ -144,7 +143,7 @@ def _decompress(self, data):
         if self.decompressor and data:
             try:
                 data = self.decompressor.decompress(data)
-            except Exception as e:
+            except zlib.error as e:
                 # if first read attempt, assume non-gzipped stream
                 if self.num_block_read == 0:
                     if self.decomp_type == 'deflate':
@@ -342,6 +341,8 @@ def _try_decode(self, length_header):
 
         if not chunk_size:
             # chunk_size 0 indicates end of file
+            final_data = self.stream.read(2)
+            assert(final_data == b'\r\n')
             self.all_chunks_read = True
             self._process_read(b'')
             return

From a55afd311512ca11b96be25f94043ae26fddaf47 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Sat, 2 Feb 2019 09:30:51 -0800
Subject: [PATCH 30/68] tester: downgrade unchunk/decompress errors to comments

---
 warcio/tester.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/warcio/tester.py b/warcio/tester.py
index 84167c4c..84ea75c3 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -840,6 +840,7 @@ def _process_one(warcfile, all_records, concurrent_to, verbose):
         return
     with open(warcfile, 'rb') as stream:
         for record in WARCIterator(stream, check_digests=True, fixup_bugs=False, raise_exceptions=True):
+        #for record in WARCIterator(stream, check_digests=True, fixup_bugs=False):
 
             record = WrapRecord(record)
             digest_present = (record.rec_headers.get_header('WARC-Payload-Digest') or
@@ -850,11 +851,11 @@ def _process_one(warcfile, all_records, concurrent_to, verbose):
 
             try:
                 record.stream_for_digest_check()
-            except ChunkedDataException:
-                commentary.error('Transfer-Encoding: chunked, saw an error attempting to unchunk')
+            except ChunkedDataException as e:
+                commentary.comment('Transfer-Encoding: chunked, saw exception: '+str(e))
                 pass
             except DecompressionException as e:
-                commentary.error('Content-Encoding indicates compression, saw an error attempting to decompress: '+str(e))
+                commentary.comment('Content-Encoding indicates compression, saw: '+str(e))
                 pass
 
             if verbose or commentary.has_comments() or record.digest_checker.passed is False:

From a33a5eb104e969ee8af3abbe2df0a925e641608a Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Wed, 6 Feb 2019 11:53:02 -0800
Subject: [PATCH 31/68] put tester output in external files

---
 test/data/example-digest-bad.warc.test        |  22 ++
 test/data/example.warc.test                   |  16 +
 .../standard-torture-validate-field.warc.test |  80 ++++
 ...standard-torture-validate-record.warc.test | 112 ++++++
 test/test_tester.py                           |  96 +++++
 test/test_tests.py                            | 348 ------------------
 6 files changed, 326 insertions(+), 348 deletions(-)
 create mode 100644 test/data/example-digest-bad.warc.test
 create mode 100644 test/data/example.warc.test
 create mode 100644 test/data/standard-torture-validate-field.warc.test
 create mode 100644 test/data/standard-torture-validate-record.warc.test
 create mode 100644 test/test_tester.py
 delete mode 100644 test/test_tests.py

diff --git a/test/data/example-digest-bad.warc.test b/test/data/example-digest-bad.warc.test
new file mode 100644
index 00000000..15a5efaf
--- /dev/null
+++ b/test/data/example-digest-bad.warc.test
@@ -0,0 +1,22 @@
+test/data/example-digest-bad.warc
+  WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
+    WARC-Type request
+    payload digest failed: sha1:1112H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ
+    error: WARC-IP-Address should be used for http and https requests
+  WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
+    WARC-Type request
+    digest pass
+    error: WARC-IP-Address should be used for http and https requests
+    error: Duplicate WARC-Record-ID: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
+  WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
+    WARC-Type request
+    digest pass
+    error: WARC-IP-Address should be used for http and https requests
+    error: Duplicate WARC-Record-ID: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
+  WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
+    WARC-Type request
+    digest pass
+    error: WARC-IP-Address should be used for http and https requests
+    error: Duplicate WARC-Record-ID: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
+global Concurrent-To checks
+  comment: WARC-Concurrent-To not found: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007> WARC-Concurrent-To <urn:uuid:a9c51e3e-0221-11e7-bf66-0242ac120005>
diff --git a/test/data/example.warc.test b/test/data/example.warc.test
new file mode 100644
index 00000000..52b3c79f
--- /dev/null
+++ b/test/data/example.warc.test
@@ -0,0 +1,16 @@
+test/data/example.warc
+  WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
+    WARC-Type request
+    digest not present
+    error: WARC-IP-Address should be used for http and https requests
+  WARC-Record-ID <urn:uuid:e6e395ca-0221-11e7-a18d-0242ac120005>
+    WARC-Type revisit
+    digest present but not checked (revisit)
+    recommendation: Missing recommended header: WARC-Refers-To
+    comment: This Heretrix extension never made it into the standard: WARC-Profile http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest
+    comment: Field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com/
+    comment: Field was introduced after this warc version: 1.0 WARC-Refers-To-Date 2017-03-06T04:02:06Z
+  WARC-Record-ID <urn:uuid:e6e41fea-0221-11e7-8fe3-0242ac120007>
+    WARC-Type request
+    digest not present
+    error: WARC-IP-Address should be used for http and https requests
diff --git a/test/data/standard-torture-validate-field.warc.test b/test/data/standard-torture-validate-field.warc.test
new file mode 100644
index 00000000..de2e3fe1
--- /dev/null
+++ b/test/data/standard-torture-validate-field.warc.test
@@ -0,0 +1,80 @@
+test/data/standard-torture-validate-field.warc
+  WARC-Record-ID <urn:uuid:torture-validate-field>
+    WARC-Type does-not-exist
+    unknown hash algorithm name in block digest
+    error: uri must not be within <>: WARC-Target-URI <http://example.com/>
+    error: Duplicate field seen: WARC-Target-URI example.com
+    error: Invalid uri, no scheme: WARC-Target-URI example.com
+    error: Duplicate field seen: WARC-Target-URI ex ample.com
+    error: Invalid uri, no scheme: WARC-Target-URI ex ample.com
+    error: Invalid uri, contains whitespace: WARC-Target-URI ex ample.com
+    error: Duplicate field seen: WARC-Target-URI h<>ttp://example.com/
+    error: Invalid uri scheme, bad character: WARC-Target-URI h<>ttp://example.com/
+    error: Duplicate field seen: WARC-Type CAPITALIZED
+    error: uri must be within <>: WARC-Concurrent-To http://example.com/
+    error: Duplicate field seen: WARC-Date 2017-03-06T04:03:53.Z
+    error: Invalid timestamp: WARC-Date 2017-03-06T04:03:53.Z
+    error: WARC versions <= 1.0 may not have timestamps with fractional seconds: WARC-Date 2017-03-06T04:03:53.Z
+    error: Must contain a /: Content-Type asdf
+    error: Invalid subtype: Content-Type asdf
+    error: Duplicate field seen: Content-Type has space/asdf
+    error: Invalid type: Content-Type has space/asdf
+    error: Duplicate field seen: Content-Type asdf/has space
+    error: Invalid subtype: Content-Type asdf/has space
+    error: Duplicate field seen: Content-Type asdf/has space;asdf
+    error: Invalid subtype: Content-Type asdf/has space;asdf
+    error: Missing algorithm: WARC-Block-Digest asdf
+    error: Duplicate field seen: WARC-Block-Digest has space:asdf
+    error: Invalid algorithm: WARC-Block-Digest has space:asdf
+    error: Duplicate field seen: WARC-Block-Digest sha1:&$*^&*^#*&^
+    error: Invalid ip: WARC-IP-Address 1.2.3.4.5
+    error: uri must be within <>: WARC-Warcinfo-ID asdf:asdf
+    error: Duplicate field seen: WARC-Profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest
+    error: Must contain a /: WARC-Identified-Payload-Type asdf
+    error: Invalid subtype: WARC-Identified-Payload-Type asdf
+    error: uri must be within <>: WARC-Segment-Origin-ID http://example.com
+    error: Must be an integer: WARC-Segment-Number not-an-integer
+    error: Duplicate field seen: WARC-Segment-Number 0
+    error: Must be 1 or greater: WARC-Segment-Number 0
+    error: Non-continuation records must always have WARC-Segment-Number: 1: WARC-Segment-Number 0
+    error: Duplicate field seen: WARC-Segment-Number 1
+    error: Duplicate field seen: WARC-Segment-Number 2
+    error: Non-continuation records must always have WARC-Segment-Number: 1: WARC-Segment-Number 2
+    error: Duplicate field seen: WARC-Segment-Total-Length not-an-integer
+    error: Must be an integer: WARC-Segment-Total-Length not-an-integer
+    error: Invalid timestamp: WARC-Refers-To-Date not-a-date
+    comment: Unknown WARC-Type: WARC-Type does-not-exist
+    comment: WARC-Type is not lower-case: WARC-Type CAPITALIZED
+    comment: Unknown WARC-Type: WARC-Type CAPITALIZED
+    comment: Unknown digest algorithm: WARC-Block-Digest asdf
+    comment: Invalid-looking digest value: WARC-Block-Digest sha1:&$*^&*^#*&^
+    comment: Unknown value, perhaps an extension: WARC-Truncated invalid
+    comment: Unknown value, perhaps an extension: WARC-Profile asdf
+    comment: Field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com
+    comment: Field was introduced after this warc version: 1.0 WARC-Refers-To-Date not-a-date
+    comment: This Heretrix extension never made it into the standard: WARC-Refers-To-Filename asdf
+    comment: This Heretrix extension never made it into the standard: WARC-Refers-To-File-Offset 1234
+    comment: Unknown field, no validation performed: WARC-Unknown-Field asdf
+  WARC-Record-ID None
+    WARC-Type invalid
+    digest not present
+    error: Duplicate field seen: WARC-Date 2017-03-06T04:03:53.Z
+    error: Invalid timestamp: WARC-Date 2017-03-06T04:03:53.Z
+    error: Duplicate field seen: WARC-Date 2017-03-06T04:03:53.0Z
+    comment: Unknown WARC-Type: WARC-Type invalid
+  WARC-Record-ID None
+    WARC-Type request
+    digest not present
+    error: Segmented records must have both WARC-Segment-Number and WARC-Segment-Origin-ID
+    error: Missing required header: Content-Type
+    error: Missing required header: WARC-Date
+    error: Missing required header: WARC-Record-ID
+    error: Missing required header: WARC-Target-URI
+    recommendation: Do not segment WARC-Type request
+  saw exception ArchiveLoadFailed: Invalid WARC record, first line: WARC/invalid
+  skipping rest of file
+global warcinfo checks
+  comment: WARC-Warcinfo-ID not found: <urn:uuid:torture-validate-field> WARC-Warcinfo-ID asdf:asdf
+global Concurrent-To checks
+  comment: WARC-Concurrent-To not found: <urn:uuid:torture-validate-field> WARC-Concurrent-To <uri:urn:asdf-asdf-asdf>
+  comment: WARC-Concurrent-To not found: <urn:uuid:torture-validate-field> WARC-Concurrent-To http://example.com/
diff --git a/test/data/standard-torture-validate-record.warc.test b/test/data/standard-torture-validate-record.warc.test
new file mode 100644
index 00000000..e7b17345
--- /dev/null
+++ b/test/data/standard-torture-validate-record.warc.test
@@ -0,0 +1,112 @@
+test/data/standard-torture-validate-record.warc
+  WARC-Record-ID None
+    WARC-Type warcinfo
+    digest not present
+    error: uri must be within <>: WARC-Refers-To probhibited
+    error: Missing required header: WARC-Date
+    error: Missing required header: WARC-Record-ID
+    error: Field not allowed in record type: warcinfo WARC-Refers-To
+    error: warc-fields contains invalid utf-8: 'utf-8' codec can't decode byte 0xc3 in position 57: invalid continuation byte
+    comment: The first line of warc-fields cannot start with whitespace
+    comment: warc-fields lines must end with \r\n: test: lines should end with \r\n
+    comment: Missing colon in warc-fields line: no colon
+    comment: Invalid warc-fields name: token cannot have a space
+  WARC-Record-ID <uri:uuid:test-empty-warc-fields>
+    WARC-Type warcinfo
+    digest not present
+    error: Missing required header: WARC-Date
+    comment: warc-fields block present but empty
+  WARC-Record-ID <uri:uuid:test-warcinfo-non-recommended-content-type>
+    WARC-Type warcinfo
+    digest not present
+    error: Missing required header: WARC-Date
+    recommendation: warcinfo Content-Type recommended to be application/warc-fields: not-application/warc-fields
+  WARC-Record-ID <uri:uuid:test-response-content-type>
+    WARC-Type response
+    digest not present
+    error: Missing required header: WARC-Date
+    error: Responses for http/https should have Content-Type of application/http; msgtype=response or application/http: text/plain
+    error: WARC-IP-Address should be used for http and https responses
+    error: http/https responses should have http headers
+  WARC-Record-ID <uri:uuid:test-resource-dns-content-type>
+    WARC-Type resource
+    digest not present
+    error: Missing required header: WARC-Date
+    error: resource records for dns shall have Content-Type of text/dns: text/plain
+  WARC-Record-ID <uri:uuid:test-resource-dns-empty>
+    WARC-Type resource
+    digest not present
+    error: Missing required header: WARC-Date
+    comment: Unknown field, no validation performed: WARC-Test-TODO add another with valid block
+  WARC-Record-ID <uri:uuid:test-resource-not-dns>
+    WARC-Type resource
+    digest not present
+    error: Missing required header: Content-Type
+    error: Missing required header: WARC-Date
+  WARC-Record-ID <uri:uuid:test-request-content-type>
+    WARC-Type request
+    digest not present
+    error: Missing required header: WARC-Date
+    error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http: text/plain
+    error: WARC-IP-Address should be used for http and https requests
+  WARC-Record-ID <uri:uuid:test-request-content-type-with-ip>
+    WARC-Type request
+    digest not present
+    error: Missing required header: WARC-Date
+    error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http: text/plain
+  WARC-Record-ID <uri:uuid:test-metadata-warc-fields-empty>
+    WARC-Type metadata
+    digest not present
+    error: Missing required header: WARC-Date
+    comment: warc-fields block present but empty
+  WARC-Record-ID <uri:uuid:test-metadata-not-warc-fields>
+    WARC-Type metadata
+    digest not present
+    error: Missing required header: WARC-Date
+  WARC-Record-ID <uri:uuid:test-revisit-profile-unknown>
+    WARC-Type revisit
+    digest not present
+    error: Missing required header: Content-Type
+    error: Missing required header: WARC-Date
+    error: Missing required header: WARC-Target-URI
+    comment: Unknown value, perhaps an extension: WARC-Profile none
+    comment: No revisit details validation done due to unknown profile: none
+  WARC-Record-ID <uri:uuid:test-revisit-profile-future>
+    WARC-Type revisit
+    digest not present
+    error: Missing required header: Content-Type
+    error: Missing required header: WARC-Date
+    error: Missing required header: WARC-Target-URI
+    error: Missing required header: WARC-Payload-Digest
+    recommendation: Missing recommended header: WARC-Refers-To
+    recommendation: Missing recommended header: WARC-Refers-To-Date
+    recommendation: Missing recommended header: WARC-Refers-To-Target-URI
+    comment: WARC-Profile value is for a different version: 1.0 http://netpreserve.org/warc/1.1/revisit/identical-payload-digest
+  WARC-Record-ID <uri:uuid:test-revisit-profile-good>
+    WARC-Type revisit
+    digest not present
+    error: Missing required header: Content-Type
+    error: Missing required header: WARC-Date
+    error: Missing required header: WARC-Target-URI
+    recommendation: Missing recommended header: WARC-Refers-To
+    recommendation: Missing recommended header: WARC-Refers-To-Date
+  WARC-Record-ID <uri:uuid:test-conversion>
+    WARC-Type conversion
+    digest not present
+    error: Missing required header: WARC-Date
+    error: Missing required header: WARC-Target-URI
+  WARC-Record-ID <uri:uuid:test-continuation-segment-1>
+    WARC-Type continuation
+    digest not present
+    error: Missing required header: WARC-Date
+    error: Missing required header: WARC-Segment-Origin-ID
+    error: Missing required header: WARC-Target-URI
+    error: continuation record must have WARC-Segment-Number > 1: 1
+    comment: warcio test continuation code has not been tested, expect bugs
+  WARC-Record-ID <uri:uuid:test-continuation-segment-valid>
+    WARC-Type continuation
+    digest not present
+    error: Missing required header: WARC-Date
+    error: Missing required header: WARC-Segment-Origin-ID
+    error: Missing required header: WARC-Target-URI
+    comment: warcio test continuation code has not been tested, expect bugs
diff --git a/test/test_tester.py b/test/test_tester.py
new file mode 100644
index 00000000..49b1cc6d
--- /dev/null
+++ b/test/test_tester.py
@@ -0,0 +1,96 @@
+from warcio.cli import main
+from warcio.utils import to_native_str
+import warcio.tester
+
+from . import get_test_file
+from .test_cli import patch_stdout
+
+
+file_map = {}
+
+
+def map_test_file(filename):
+    file_map[filename] = get_test_file(filename)
+    return file_map[filename]
+
+
+def helper(args, expected_exit_value):
+    with patch_stdout() as buff:
+        exit_value = None
+        try:
+            main(args=args)
+        except SystemExit as e:
+            exit_value = e.code
+        finally:
+            assert exit_value == expected_exit_value
+
+        return to_native_str(buff.getvalue())
+
+
+def remove_before_test_data(s):
+    ret = ''
+    for line in s.splitlines(True):
+        for filename, value in file_map.items():
+            if value in line:
+                line = line.replace(value, 'test/data/' + filename)
+        ret += line
+    return ret
+
+
+def run_one(f):
+    args = ['test']
+    args.append(f)
+
+    with open(f+'.test', 'r') as expectedf:
+        expected = expectedf.read()
+
+    value = helper(args, 0)
+    print(remove_before_test_data(value))
+
+    actual = remove_before_test_data(value)
+
+    assert actual == expected
+
+
+def test_torture():
+    files = ['standard-torture-validate-record.warc',
+             'standard-torture-validate-field.warc']
+    [run_one(map_test_file(filename)) for filename in files]
+
+
+def test_arc():
+    files = ['does-not-exist.arc']
+    files = [map_test_file(filename) for filename in files]
+
+    args = ['test']
+    args.extend(files)
+
+    expected = """\
+test/data/does-not-exist.arc
+"""
+
+    value = helper(args, 0)
+    assert remove_before_test_data(value) == expected
+
+
+def test_digests():
+    # needed for test coverage
+    files = ['example-digest-bad.warc', 'example.warc']
+    [run_one(map_test_file(filename)) for filename in files]
+
+
+def test_leftovers():
+    commentary = warcio.tester.Commentary('id', 'type')
+    assert not commentary.has_comments()
+
+    # hard to test because invalid WARC Content-Length raises in archiveiterator
+    warcio.tester.validate_content_length('Content-Length', 'not-an-integer', None, '1.0', commentary, None)
+
+    # hard to test because warcio raises for unknown WARC version
+    warcio.tester.validate_profile('blah', 'blah', None, '999', commentary, None)
+
+    expected = '''\
+error: Must be an integer: Content-Length not-an-integer
+'''
+
+    assert '\n'.join(commentary.comments())+'\n' == expected
diff --git a/test/test_tests.py b/test/test_tests.py
deleted file mode 100644
index 200df8ae..00000000
--- a/test/test_tests.py
+++ /dev/null
@@ -1,348 +0,0 @@
-from warcio.cli import main
-from warcio.utils import to_native_str
-import warcio.tester
-
-from . import get_test_file
-from .test_cli import patch_stdout
-
-
-file_map = {}
-
-
-def map_test_file(filename):
-    file_map[filename] = get_test_file(filename)
-    return file_map[filename]
-
-
-def helper(args, expected_exit_value):
-    with patch_stdout() as buff:
-        exit_value = None
-        try:
-            main(args=args)
-        except SystemExit as e:
-            exit_value = e.code
-        finally:
-            assert exit_value == expected_exit_value
-
-        return to_native_str(buff.getvalue())
-
-
-def remove_before_test_data(s):
-    ret = ''
-    for line in s.splitlines(True):
-        for filename, value in file_map.items():
-            if value in line:
-                line = line.replace(value, 'test/data/' + filename)
-        ret += line
-    return ret
-
-
-def test_torture_validate_record():
-    files = ['standard-torture-validate-record.warc']
-    files = [map_test_file(filename) for filename in files]
-
-    args = ['test']
-    args.extend(files)
-
-    expected = """\
-test/data/standard-torture-validate-record.warc
-  WARC-Record-ID None
-    WARC-Type warcinfo
-    digest not present
-    error: uri must be within <>: WARC-Refers-To probhibited
-    error: missing required header: WARC-Date
-    error: missing required header: WARC-Record-ID
-    error: field not allowed in record type: warcinfo WARC-Refers-To
-    error: warc-fields contains invalid utf-8: 'utf-8' codec can't decode byte 0xc3 in position 57: invalid continuation byte
-    comment: The first line of warc-fields cannot start with whitespace
-    comment: warc-fields lines must end with \\r\\n: test: lines should end with \\r\\n
-    comment: Missing colon in warc-fields line: no colon
-    comment: Invalid warc-fields name: token cannot have a space
-  WARC-Record-ID <uri:uuid:test-empty-warc-fields>
-    WARC-Type warcinfo
-    digest not present
-    error: missing required header: WARC-Date
-    comment: warc-fields block present but empty
-  WARC-Record-ID <uri:uuid:test-warcinfo-non-recommended-content-type>
-    WARC-Type warcinfo
-    digest not present
-    error: missing required header: WARC-Date
-    recommendation: warcinfo Content-Type recommended to be application/warc-fields: not-application/warc-fields
-  WARC-Record-ID <uri:uuid:test-response-content-type>
-    WARC-Type response
-    digest not present
-    error: missing required header: WARC-Date
-    error: responses for http/https should have Content-Type of application/http; msgtype=response or application/http: text/plain
-    error: WARC-IP-Address should be used for http and https responses
-    error: http/https responses should have http headers
-  WARC-Record-ID <uri:uuid:test-resource-dns-content-type>
-    WARC-Type resource
-    digest not present
-    error: missing required header: WARC-Date
-    error: resource records for dns shall have Content-Type of text/dns: text/plain
-  WARC-Record-ID <uri:uuid:test-resource-dns-empty>
-    WARC-Type resource
-    digest not present
-    error: missing required header: WARC-Date
-    comment: unknown field, no validation performed: WARC-Test-TODO add another with valid block
-  WARC-Record-ID <uri:uuid:test-resource-not-dns>
-    WARC-Type resource
-    digest not present
-    error: missing required header: Content-Type
-    error: missing required header: WARC-Date
-  WARC-Record-ID <uri:uuid:test-request-content-type>
-    WARC-Type request
-    digest not present
-    error: missing required header: WARC-Date
-    error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http: text/plain
-    error: WARC-IP-Address should be used for http and https requests
-  WARC-Record-ID <uri:uuid:test-request-content-type-with-ip>
-    WARC-Type request
-    digest not present
-    error: missing required header: WARC-Date
-    error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http: text/plain
-  WARC-Record-ID <uri:uuid:test-metadata-warc-fields-empty>
-    WARC-Type metadata
-    digest not present
-    error: missing required header: WARC-Date
-    comment: warc-fields block present but empty
-  WARC-Record-ID <uri:uuid:test-metadata-not-warc-fields>
-    WARC-Type metadata
-    digest not present
-    error: missing required header: WARC-Date
-  WARC-Record-ID <uri:uuid:test-revisit-profile-unknown>
-    WARC-Type revisit
-    digest not present
-    error: missing required header: Content-Type
-    error: missing required header: WARC-Date
-    error: missing required header: WARC-Target-URI
-    comment: unknown value, perhaps an extension: WARC-Profile none
-    comment: no revisit details validation done due to unknown profile: none
-  WARC-Record-ID <uri:uuid:test-revisit-profile-future>
-    WARC-Type revisit
-    digest not present
-    error: missing required header: Content-Type
-    error: missing required header: WARC-Date
-    error: missing required header: WARC-Target-URI
-    error: missing required header: WARC-Payload-Digest
-    recommendation: missing recommended header: WARC-Refers-To
-    recommendation: missing recommended header: WARC-Refers-To-Date
-    recommendation: missing recommended header: WARC-Refers-To-Target-URI
-    comment: WARC-Profile value is for a different version: 1.0 http://netpreserve.org/warc/1.1/revisit/identical-payload-digest
-  WARC-Record-ID <uri:uuid:test-revisit-profile-good>
-    WARC-Type revisit
-    digest not present
-    error: missing required header: Content-Type
-    error: missing required header: WARC-Date
-    error: missing required header: WARC-Target-URI
-    recommendation: missing recommended header: WARC-Refers-To
-    recommendation: missing recommended header: WARC-Refers-To-Date
-  WARC-Record-ID <uri:uuid:test-conversion>
-    WARC-Type conversion
-    digest not present
-    error: missing required header: WARC-Date
-    error: missing required header: WARC-Target-URI
-  WARC-Record-ID <uri:uuid:test-continuation-segment-1>
-    WARC-Type continuation
-    digest not present
-    error: missing required header: WARC-Date
-    error: missing required header: WARC-Segment-Origin-ID
-    error: missing required header: WARC-Target-URI
-    error: continuation record must have WARC-Segment-Number > 1: 1
-    comment: warcio test continuation code has not been tested, expect bugs
-  WARC-Record-ID <uri:uuid:test-continuation-segment-valid>
-    WARC-Type continuation
-    digest not present
-    error: missing required header: WARC-Date
-    error: missing required header: WARC-Segment-Origin-ID
-    error: missing required header: WARC-Target-URI
-    comment: warcio test continuation code has not been tested, expect bugs
-"""
-
-    value = helper(args, 0)
-    print(remove_before_test_data(value))
-
-    actual = remove_before_test_data(value)
-
-    assert actual == expected
-
-
-def test_torture_validate_field():
-    files = ['standard-torture-validate-field.warc']
-    files = [map_test_file(filename) for filename in files]
-
-    args = ['test']
-    args.extend(files)
-
-    expected = """\
-test/data/standard-torture-validate-field.warc
-  WARC-Record-ID <urn:uuid:torture-validate-field>
-    WARC-Type does-not-exist
-    unknown hash algorithm name in block digest
-    error: uri must not be within <>: WARC-Target-URI <http://example.com/>
-    error: invalid uri scheme, bad character: WARC-Target-URI <http://example.com/>
-    error: duplicate field seen: WARC-Target-URI example.com
-    error: invalid uri, no scheme: WARC-Target-URI example.com
-    error: duplicate field seen: WARC-Target-URI ex ample.com
-    error: invalid uri, no scheme: WARC-Target-URI ex ample.com
-    error: invalid uri, contains whitespace: WARC-Target-URI ex ample.com
-    error: invalid uri scheme, bad character: WARC-Target-URI ex ample.com
-    error: duplicate field seen: WARC-Target-URI h<>ttp://example.com/
-    error: invalid uri scheme, bad character: WARC-Target-URI h<>ttp://example.com/
-    error: duplicate field seen: WARC-Type CAPITALIZED
-    error: uri must be within <>: WARC-Concurrent-To http://example.com/
-    error: duplicate field seen: WARC-Date 2017-03-06T04:03:53.Z
-    error: Invalid timestamp: WARC-Date 2017-03-06T04:03:53.Z
-    error: WARC versions <= 1.0 may not have timestamps with fractional seconds: WARC-Date 2017-03-06T04:03:53.Z
-    error: must contain a /: Content-Type asdf
-    error: invalid subtype: Content-Type asdf
-    error: duplicate field seen: Content-Type has space/asdf
-    error: invalid type: Content-Type has space/asdf
-    error: duplicate field seen: Content-Type asdf/has space
-    error: invalid subtype: Content-Type asdf/has space
-    error: duplicate field seen: Content-Type asdf/has space;asdf
-    error: invalid subtype: Content-Type asdf/has space;asdf
-    error: missing algorithm: WARC-Block-Digest asdf
-    error: duplicate field seen: WARC-Block-Digest has space:asdf
-    error: invalid algorithm: WARC-Block-Digest has space:asdf
-    error: duplicate field seen: WARC-Block-Digest sha1:&$*^&*^#*&^
-    error: invalid ip: WARC-IP-Address 1.2.3.4.5
-    error: uri must be within <>: WARC-Warcinfo-ID asdf:asdf
-    error: duplicate field seen: WARC-Profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest
-    error: must contain a /: WARC-Identified-Payload-Type asdf
-    error: invalid subtype: WARC-Identified-Payload-Type asdf
-    error: uri must be within <>: WARC-Segment-Origin-ID http://example.com
-    error: must be an integer: WARC-Segment-Number not-an-integer
-    error: duplicate field seen: WARC-Segment-Number 0
-    error: must be 1 or greater: WARC-Segment-Number 0
-    error: non-continuation records must always have WARC-Segment-Number: 1: WARC-Segment-Number 0
-    error: duplicate field seen: WARC-Segment-Number 1
-    error: duplicate field seen: WARC-Segment-Number 2
-    error: non-continuation records must always have WARC-Segment-Number: 1: WARC-Segment-Number 2
-    error: duplicate field seen: WARC-Segment-Total-Length not-an-integer
-    error: must be an integer: WARC-Segment-Total-Length not-an-integer
-    error: Invalid timestamp: WARC-Refers-To-Date not-a-date
-    comment: unknown WARC-Type: WARC-Type does-not-exist
-    comment: WARC-Type is not lower-case: WARC-Type CAPITALIZED
-    comment: unknown WARC-Type: WARC-Type CAPITALIZED
-    comment: unknown digest algorithm: WARC-Block-Digest asdf
-    comment: Invalid-looking digest value: WARC-Block-Digest sha1:&$*^&*^#*&^
-    comment: unknown value, perhaps an extension: WARC-Truncated invalid
-    comment: unknown value, perhaps an extension: WARC-Profile asdf
-    comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com
-    comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Date not-a-date
-    comment: This Heretrix extension never made it into the standard: WARC-Refers-To-Filename asdf
-    comment: This Heretrix extension never made it into the standard: WARC-Refers-To-File-Offset 1234
-    comment: unknown field, no validation performed: WARC-Unknown-Field asdf
-  WARC-Record-ID None
-    WARC-Type invalid
-    digest not present
-    error: duplicate field seen: WARC-Date 2017-03-06T04:03:53.Z
-    error: Invalid timestamp: WARC-Date 2017-03-06T04:03:53.Z
-    error: duplicate field seen: WARC-Date 2017-03-06T04:03:53.0Z
-    comment: unknown WARC-Type: WARC-Type invalid
-  WARC-Record-ID None
-    WARC-Type request
-    digest not present
-    error: segmented records must have both WARC-Segment-Number and WARC-Segment-Origin-ID
-    error: missing required header: Content-Type
-    error: missing required header: WARC-Date
-    error: missing required header: WARC-Record-ID
-    error: missing required header: WARC-Target-URI
-    recommendation: do not segment WARC-Type request
-global warcinfo checks
-  comment: WARC-Warcinfo-ID not found: <urn:uuid:torture-validate-field> WARC-Warcinfo-ID asdf:asdf
-global Concurrent-To checks
-  comment: WARC-Concurrent-To not found: <urn:uuid:torture-validate-field> WARC-Concurrent-To <uri:urn:asdf-asdf-asdf>
-  comment: WARC-Concurrent-To not found: <urn:uuid:torture-validate-field> WARC-Concurrent-To http://example.com/
-"""
-
-    value = helper(args, 0)
-    actual = remove_before_test_data(value)
-
-    print(actual)
-    assert actual == expected
-
-
-def test_arc():
-    files = ['does-not-exist.arc']
-    files = [map_test_file(filename) for filename in files]
-
-    args = ['test']
-    args.extend(files)
-
-    expected = """\
-test/data/does-not-exist.arc
-"""
-
-    value = helper(args, 0)
-    assert remove_before_test_data(value) == expected
-
-
-def test_digests():
-    # needed for test coverage
-    files = ['example-digest-bad.warc', 'example.warc']
-    files = [map_test_file(filename) for filename in files]
-
-    args = ['test']
-    args.extend(files)
-
-    expected = """\
-test/data/example-digest-bad.warc
-  WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
-    WARC-Type request
-    payload digest failed: sha1:1112H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ
-    error: WARC-IP-Address should be used for http and https requests
-  WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
-    WARC-Type request
-    digest pass
-    error: WARC-IP-Address should be used for http and https requests
-    error: Duplicate WARC-Record-ID: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
-  WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
-    WARC-Type request
-    digest pass
-    error: WARC-IP-Address should be used for http and https requests
-    error: Duplicate WARC-Record-ID: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
-  WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
-    WARC-Type request
-    digest pass
-    error: WARC-IP-Address should be used for http and https requests
-    error: Duplicate WARC-Record-ID: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
-test/data/example.warc
-  WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
-    WARC-Type request
-    digest not present
-    error: WARC-IP-Address should be used for http and https requests
-    error: Duplicate WARC-Record-ID: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007> found in files test/data/example.warc test/data/example-digest-bad.warc
-  WARC-Record-ID <urn:uuid:e6e395ca-0221-11e7-a18d-0242ac120005>
-    WARC-Type revisit
-    digest present but not checked (revisit)
-    recommendation: Missing recommended header: WARC-Refers-To
-    comment: This Heretrix extension never made it into the standard: WARC-Profile http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest
-    comment: Field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com/
-    comment: Field was introduced after this warc version: 1.0 WARC-Refers-To-Date 2017-03-06T04:02:06Z
-  WARC-Record-ID <urn:uuid:e6e41fea-0221-11e7-8fe3-0242ac120007>
-    WARC-Type request
-    digest not present
-    error: WARC-IP-Address should be used for http and https requests
-"""
-
-    value = helper(args, 0)
-    assert remove_before_test_data(value) == expected
-
-
-def test_leftovers():
-    commentary = warcio.tester.Commentary('id', 'type')
-    assert not commentary.has_comments()
-
-    # hard to test because invalid WARC Content-Length raises in archiveiterator
-    warcio.tester.validate_content_length('Content-Length', 'not-an-integer', None, '1.0', commentary, None)
-
-    # hard to test because warcio raises for unknown WARC version
-    warcio.tester.validate_profile('blah', 'blah', None, '999', commentary, None)
-
-    expected = '''\
-error: Must be an integer: Content-Length not-an-integer
-'''
-
-    assert '\n'.join(commentary.comments())+'\n' == expected

From fec139ac253022895d4b864b3c73832a7c8c9a90 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Thu, 4 Apr 2019 15:01:31 -0700
Subject: [PATCH 32/68] wip

---
 test/test_tester.py       |  2 +-
 warcio/archiveiterator.py |  5 ++-
 warcio/bufferedreaders.py | 49 +++++++++++++------------
 warcio/recordloader.py    | 43 ++++++++++++++++++----
 warcio/tester.py          | 76 +++++++++------------------------------
 5 files changed, 82 insertions(+), 93 deletions(-)

diff --git a/test/test_tester.py b/test/test_tester.py
index 49b1cc6d..08963ea9 100644
--- a/test/test_tester.py
+++ b/test/test_tester.py
@@ -80,7 +80,7 @@ def test_digests():
 
 
 def test_leftovers():
-    commentary = warcio.tester.Commentary('id', 'type')
+    commentary = warcio.recordloader.Commentary()
     assert not commentary.has_comments()
 
     # hard to test because invalid WARC Content-Length raises in archiveiterator
diff --git a/warcio/archiveiterator.py b/warcio/archiveiterator.py
index 176acb1c..0d1fe2dd 100644
--- a/warcio/archiveiterator.py
+++ b/warcio/archiveiterator.py
@@ -43,14 +43,13 @@ class ArchiveIterator(six.Iterator):
     def __init__(self, fileobj, no_record_parse=False,
                  verify_http=False, arc2warc=False,
                  ensure_http_headers=False, block_size=BUFF_SIZE,
-                 check_digests=False, fixup_bugs=True, raise_exceptions=False):
+                 check_digests=False, fixup_bugs=True):
 
         self.fh = fileobj
 
         self.loader = ArcWarcRecordLoader(verify_http=verify_http,
                                           arc2warc=arc2warc,
-                                          fixup_bugs=fixup_bugs,
-                                          raise_exceptions=raise_exceptions)
+                                          fixup_bugs=fixup_bugs)
         self.known_format = None
 
         self.mixed_arc_warc = arc2warc
diff --git a/warcio/bufferedreaders.py b/warcio/bufferedreaders.py
index 97325b7d..e07dcd4b 100644
--- a/warcio/bufferedreaders.py
+++ b/warcio/bufferedreaders.py
@@ -36,12 +36,6 @@ def brotli_decompressor():
         pass
 
 
-#=================================================================
-class DecompressionException(Exception):
-    def __init__(self, msg):
-        Exception.__init__(self, msg)
-
-
 #=================================================================
 class BufferedReader(object):
     """
@@ -71,7 +65,7 @@ def __init__(self, stream, block_size=BUFF_SIZE,
                  decomp_type=None,
                  starting_data=None,
                  read_all_members=False,
-                 raise_exceptions=False):
+                 commentary=None):
 
         self.stream = stream
         self.block_size = block_size
@@ -84,7 +78,7 @@ def __init__(self, stream, block_size=BUFF_SIZE,
         self.buff_size = 0
 
         self.read_all_members = read_all_members
-        self.raise_exceptions = raise_exceptions
+        self.commentary = commentary
 
     def set_decomp(self, decomp_type):
         self._init_decomp(decomp_type)
@@ -96,6 +90,10 @@ def _init_decomp(self, decomp_type):
                 self.decomp_type = decomp_type
                 self.decompressor = self.DECOMPRESSORS[decomp_type.lower()]()
             except KeyError:
+                # XXX don't raise?
+                # we don't know if the enduser cares or not
+                # or the record might actually be uncompressed
+                # XXX what does pywb do
                 raise Exception('Decompression type not supported: ' +
                                 decomp_type)
         else:
@@ -150,8 +148,8 @@ def _decompress(self, data):
                         self._init_decomp('deflate_alt')
                         data = self._decompress(data)
                     else:
-                        if self.raise_exceptions:
-                            raise DecompressionException(str(e))
+                        if self.commentary:
+                            self.commentary.comment('Payload claimed to be compressed but apparently is not')
                         self.decompressor = None
                 # otherwise (partly decompressed), something is wrong
                 else:
@@ -290,40 +288,43 @@ class ChunkedDataReader(BufferedReader):
     If at any point the chunked header is not available, the stream is
     assumed to not be chunked and no more dechunking occurs.
     """
-    def __init__(self, stream, **kwargs):
+    def __init__(self, stream, raise_exceptions=False, commentary=None, **kwargs):
         super(ChunkedDataReader, self).__init__(stream, **kwargs)
         self.all_chunks_read = False
-        self.not_chunked = False
-
-        # if False, we'll use best-guess fallback for parse errors
-        self.raise_chunked_data_exceptions = kwargs.get('raise_exceptions')
+        self.not_actually_chunked = False
+        self.at_start = True
+        self.raise_chunked_data_exceptions = raise_exceptions
+        self.commentary = commentary
 
     def _fillbuff(self, block_size=None):
-        if self.not_chunked:
+        if self.not_actually_chunked:
             return super(ChunkedDataReader, self)._fillbuff(block_size)
 
         # Loop over chunks until there is some data (not empty())
         # In particular, gzipped data may require multiple chunks to
         # return any decompressed result
-        while (self.empty() and
-               not self.all_chunks_read and
-               not self.not_chunked):
-
+        while (self.empty() and not self.all_chunks_read):
             try:
                 length_header = self.stream.readline(64)
                 self._try_decode(length_header)
+                self.at_start = False
             except ChunkedDataException as e:
                 if self.raise_chunked_data_exceptions:
                     raise
-
                 # Can't parse the data as chunked.
                 # It's possible that non-chunked data is served
                 # with a Transfer-Encoding: chunked.
                 # Treat this as non-chunk encoded from here on.
+                if self.commentary:
+                    if self.at_start:
+                        self.commentary.comment('Buffer claimed to be chunked, but was not from the start')
+                    else:
+                        self.commentary.comment('Buffer is chunked but there was an unchunking error midway')
                 self._process_read(length_header + e.data)
-                self.not_chunked = True
+                self.not_actually_chunked = True
+                self.at_start = False
 
-                # parse as block as non-chunked
+                # parse as non-chunked
                 return super(ChunkedDataReader, self)._fillbuff(block_size)
 
     def _try_decode(self, length_header):
@@ -362,6 +363,8 @@ def _try_decode(self, length_header):
                     msg = 'Ran out of data before end of chunk'
                     raise ChunkedDataException(msg, data)
                 else:
+                    if self.commentary:
+                        self.commentary.comment('Chunked reader ran out of data before end of chunk')
                     chunk_size = data_len
                     self.all_chunks_read = True
 
diff --git a/warcio/recordloader.py b/warcio/recordloader.py
index f8a47db4..6629c9e9 100644
--- a/warcio/recordloader.py
+++ b/warcio/recordloader.py
@@ -13,6 +13,36 @@
 from six.moves import zip
 
 
+#=================================================================
+class Commentary(object):
+    def __init__(self):
+        self.errors = []
+        self.recommendations = []
+        self._comments = []
+
+    def error(self, *args):
+        self.errors.append(args)
+
+    def recommendation(self, *args):
+        self.recommendations.append(args)
+
+    def comment(self, *args):
+        self._comments.append(args)
+
+    def has_comments(self):
+        if self.errors or self.recommendations or self._comments:
+            return True
+
+    def comments(self):
+        # XXX str() all of these, in case an int or other thing slips in?
+        for e in self.errors:
+            yield 'error: ' + ' '.join(e)
+        for r in self.recommendations:
+            yield 'recommendation: ' + ' '.join(r)
+        for c in self._comments:
+            yield 'comment: ' + ' '.join(c)
+
+
 #=================================================================
 class ArcWarcRecord(object):
     def __init__(self, *args, **kwargs):
@@ -20,7 +50,7 @@ def __init__(self, *args, **kwargs):
          self.http_headers, self.content_type, self.length) = args
         self.payload_length = -1
         self.digest_checker = kwargs.get('digest_checker')
-        self.raise_exceptions = kwargs.get('raise_exceptions')
+        self.commentary = kwargs.get('commentary')
         self._content_stream = None
 
     def content_stream(self):
@@ -39,9 +69,9 @@ def content_stream(self):
                 encoding = None
 
         if self.http_headers.get_header('transfer-encoding') == 'chunked':
-            self._content_stream = ChunkedDataReader(self.raw_stream, decomp_type=encoding, raise_exceptions=self.raise_exceptions)
+            self._content_stream = ChunkedDataReader(self.raw_stream, decomp_type=encoding, commentary=self.commentary)
         elif encoding:
-            self._content_stream = BufferedReader(self.raw_stream, decomp_type=encoding, raise_exceptions=self.raise_exceptions)
+            self._content_stream = BufferedReader(self.raw_stream, decomp_type=encoding, commentary=self.commentary)
         else:
             self._content_stream = self.raw_stream
 
@@ -62,7 +92,7 @@ class ArcWarcRecordLoader(object):
     NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:')
     HTTP_SCHEMES = ('http:', 'https:')
 
-    def __init__(self, verify_http=True, arc2warc=True, fixup_bugs=True, raise_exceptions=False):
+    def __init__(self, verify_http=True, arc2warc=True, fixup_bugs=True):
         if arc2warc:
             self.arc_parser = ARC2WARCHeadersParser()
         else:
@@ -73,7 +103,6 @@ def __init__(self, verify_http=True, arc2warc=True, fixup_bugs=True, raise_excep
 
         self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http)
         self.fixup_bugs = fixup_bugs
-        self.raise_exceptions = raise_exceptions
 
     def parse_record_stream(self, stream,
                             statusline=None,
@@ -131,6 +160,7 @@ def parse_record_stream(self, stream,
 
         is_verifying = False
         digest_checker = DigestChecker(check_digests)
+        commentary = Commentary()
 
         # limit stream to the length for all valid records
         if length is not None and length >= 0:
@@ -155,7 +185,8 @@ def parse_record_stream(self, stream,
 
         return ArcWarcRecord(the_format, rec_type,
                              rec_headers, stream, http_headers,
-                             content_type, length, digest_checker=digest_checker, raise_exceptions=self.raise_exceptions)
+                             content_type, length, digest_checker=digest_checker,
+                             commentary=commentary)
 
     def wrap_digest_verifying_stream(self, stream, rec_type, rec_headers, digest_checker, length=None):
         payload_digest = rec_headers.get_header('WARC-Payload-Digest')
diff --git a/warcio/tester.py b/warcio/tester.py
index 84ea75c3..cee5344f 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -8,45 +8,8 @@
 from warcio.archiveiterator import WARCIterator
 from warcio.utils import to_native_str, Digester
 from warcio.exceptions import ArchiveLoadFailed
-from warcio.bufferedreaders import ChunkedDataException, DecompressionException
-
-
-class Commentary(object):
-    def __init__(self, record_id=None, rec_type=None):
-        self._record_id = record_id
-        self._rec_type = rec_type
-        self.errors = []
-        self.recommendations = []
-        self._comments = []
-
-    def record_id(self):
-        return self._record_id
-
-    def rec_type(self):
-        return self._rec_type
-
-    def error(self, *args):
-        self.errors.append(args)
-
-    def recommendation(self, *args):
-        self.recommendations.append(args)
-
-    def comment(self, *args):
-        self._comments.append(args)
-
-    def has_comments(self):
-        if self.errors or self.recommendations or self._comments:
-            return True
-
-    def comments(self):
-        # XXX str() all of these, in case an int or other thing slips in?
-        for e in self.errors:
-            yield 'error: ' + ' '.join(e)
-        for r in self.recommendations:
-            yield 'recommendation: ' + ' '.join(r)
-        for c in self._comments:
-            yield 'comment: ' + ' '.join(c)
-
+from warcio.bufferedreaders import ChunkedDataException
+from warcio.recordloader import Commentary
 
 class WrapRecord(object):
     def __init__(self, obj):
@@ -662,9 +625,7 @@ def validate_record_against_rec_type(config, record, commentary, pending):
 def validate_record(record):
     version = record.rec_headers.protocol.split('/', 1)[1]  # XXX not exported
 
-    record_id = record.rec_headers.get_header('WARC-Record-ID')
-    rec_type = record.rec_headers.get_header('WARC-Type')
-    commentary = Commentary(record_id=record_id, rec_type=rec_type)
+    commentary = record.commentary
     pending = None
 
     seen_fields = set()
@@ -683,6 +644,7 @@ def validate_record(record):
         if 'validate' in config:
             config['validate'](field, value, record, version, commentary, pending)
 
+    rec_type = record.rec_headers.get_header('WARC-Type')
     if rec_type not in record_types:
         # we print a comment for this elsewhere
         pass
@@ -839,37 +801,31 @@ def _process_one(warcfile, all_records, concurrent_to, verbose):
     if warcfile.endswith('.arc') or warcfile.endswith('.arc.gz'):
         return
     with open(warcfile, 'rb') as stream:
-        for record in WARCIterator(stream, check_digests=True, fixup_bugs=False, raise_exceptions=True):
-        #for record in WARCIterator(stream, check_digests=True, fixup_bugs=False):
-
+        for record in WARCIterator(stream, check_digests=True, fixup_bugs=False):
             record = WrapRecord(record)
             digest_present = (record.rec_headers.get_header('WARC-Payload-Digest') or
                               record.rec_headers.get_header('WARC-Block-Digest'))
+            record_id = record.rec_headers.get_header('WARC-Record-ID')
+            rec_type = record.rec_headers.get_header('WARC-Type')
 
-            commentary = validate_record(record)
-            save_global_info(record, warcfile, commentary, all_records, concurrent_to)
+            validate_record(record)
+            record.stream_for_digest_check()
 
-            try:
-                record.stream_for_digest_check()
-            except ChunkedDataException as e:
-                commentary.comment('Transfer-Encoding: chunked, saw exception: '+str(e))
-                pass
-            except DecompressionException as e:
-                commentary.comment('Content-Encoding indicates compression, saw: '+str(e))
-                pass
+            commentary = record.commentary
+            save_global_info(record, warcfile, commentary, all_records, concurrent_to)
 
             if verbose or commentary.has_comments() or record.digest_checker.passed is False:
-                print(' ', 'WARC-Record-ID', commentary.record_id())
-                print('   ', 'WARC-Type', commentary.rec_type())
-
+                print(' ', 'WARC-Record-ID', record_id)
+                print('   ', 'WARC-Type', rec_type)
                 if record.digest_checker.passed is True:
                     print('    digest pass')
                 elif record.digest_checker.passed is None:
                     if digest_present:
-                        if commentary.rec_type() == 'revisit':
+                        if rec_type == 'revisit':
                             print('    digest present but not checked (revisit)')
                         else:  # pragma: no cover
-                            # WARC record missing Content-Length: header, which is verboten
+                            # should not ever happen
+                            # example reason: WARC record missing Content-Length: header, but that case raises
                             print('    digest present but not checked')
                     else:
                         print('    digest not present')

From a471222c96589e7f5d7e7745aa3a2b72fcf2f2b8 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Thu, 4 Apr 2019 23:35:46 -0700
Subject: [PATCH 33/68] tweak to match new test files

---
 test/test_cli.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_cli.py b/test/test_cli.py
index 9e356912..103d5d9b 100644
--- a/test/test_cli.py
+++ b/test/test_cli.py
@@ -85,7 +85,7 @@ def test_check_valid():
 
     args = ['check', '-v'] + filenames
     value = check_helper(args, 0)
-    assert value.count(b'digest pass') == 2
+    assert value.count(b'digest pass') == 4
     assert value.count(b'WARC-Record-ID') == 12
 
 

From 30a86fe1f888d38b27855f05807ae48fad5c4c3e Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Mon, 9 Sep 2019 11:03:30 -0700
Subject: [PATCH 34/68] tests pass

---
 test/test_check_digest_examples.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/test/test_check_digest_examples.py b/test/test_check_digest_examples.py
index 679d7d24..89eb296f 100644
--- a/test/test_check_digest_examples.py
+++ b/test/test_check_digest_examples.py
@@ -9,7 +9,8 @@
         'example-iana.org-chunked.warc',
         'example-wrong-chunks.warc.gz',
         'example-bad-non-chunked.warc.gz',
-        'example-digest.warc'
+        'example-digest-bad.warc',
+        'standard-torture-validate-field.warc',
        ]
 
 
@@ -34,7 +35,7 @@ def check_helper(self, args, expected_exit_value, capsys):
         return capsys.readouterr()[0]  # list for py33 support
 
     def test_check_invalid(self, capsys):
-        filenames = [get_test_file('example-digest.warc')]
+        filenames = [get_test_file('example-digest-bad.warc')]
 
         args = ['check'] + filenames
         value = self.check_helper(args, 1, capsys)

From 19dc8b3e0e67f1384aba18fa56d34456165592d8 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Fri, 25 Jan 2019 16:05:38 -0800
Subject: [PATCH 35/68] warcio test

---
 ...le-digest.warc => example-digest-bad.warc} |   0
 test/test_archiveiterator.py                  |  10 +-
 warcio/archiveiterator.py                     |   5 +-
 warcio/cli.py                                 |  12 +
 warcio/recordloader.py                        |  10 +-
 warcio/tester.py                              | 638 ++++++++++++++++++
 6 files changed, 664 insertions(+), 11 deletions(-)
 rename test/data/{example-digest.warc => example-digest-bad.warc} (100%)
 create mode 100644 warcio/tester.py

diff --git a/test/data/example-digest.warc b/test/data/example-digest-bad.warc
similarity index 100%
rename from test/data/example-digest.warc
rename to test/data/example-digest-bad.warc
diff --git a/test/test_archiveiterator.py b/test/test_archiveiterator.py
index 10914ce5..066b53fb 100644
--- a/test/test_archiveiterator.py
+++ b/test/test_archiveiterator.py
@@ -283,6 +283,8 @@ def test_err_arc_iterator_on_warc(self):
     def test_corrects_wget_bug(self):
         with self._find_first_by_type('example-wget-bad-target-uri.warc.gz', 'response') as record:
             assert record.rec_headers.get('WARC-Target-URI') == 'http://example.com/'
+        with self._find_first_by_type('example-wget-bad-target-uri.warc.gz', 'response', fixup_bugs=False) as record:
+            assert record.rec_headers.get('WARC-Target-URI') == 'http://example.com/'
 
     def test_corrects_space_in_target_uri(self):
         with self._find_first_by_type('example-space-in-target-uri.warc.gz', 'resource') as record:
@@ -345,9 +347,9 @@ def test_digests_file(self):
         expected_t = ['request', 'request', 'request']
 
         # record 1: invalid payload digest
-        assert self._load_archive('example-digest.warc', check_digests=True) == expected_t
-        assert self._load_archive('example-digest.warc', check_digests=False) == expected_f
+        assert self._load_archive('example-digest-bad.warc', check_digests=True) == expected_t
+        assert self._load_archive('example-digest-bad.warc', check_digests=False) == expected_f
 
         # record 2: b64 digest; record 3: b64 filename safe digest
-        assert self._load_archive('example-digest.warc', offset=922, check_digests=True) == expected_t
-        assert self._load_archive('example-digest.warc', offset=922, check_digests=False) == expected_t
+        assert self._load_archive('example-digest-bad.warc', offset=922, check_digests=True) == expected_t
+        assert self._load_archive('example-digest-bad.warc', offset=922, check_digests=False) == expected_t
diff --git a/warcio/archiveiterator.py b/warcio/archiveiterator.py
index 484b7f0f..24094936 100644
--- a/warcio/archiveiterator.py
+++ b/warcio/archiveiterator.py
@@ -56,12 +56,13 @@ class ArchiveIterator(six.Iterator):
     def __init__(self, fileobj, no_record_parse=False,
                  verify_http=False, arc2warc=False,
                  ensure_http_headers=False, block_size=BUFF_SIZE,
-                 check_digests=False):
+                 check_digests=False, fixup_bugs=True):
 
         self.fh = fileobj
 
         self.loader = ArcWarcRecordLoader(verify_http=verify_http,
-                                          arc2warc=arc2warc)
+                                          arc2warc=arc2warc,
+                                          fixup_bugs=fixup_bugs)
         self.known_format = None
 
         self.mixed_arc_warc = arc2warc
diff --git a/warcio/cli.py b/warcio/cli.py
index efdf7c50..ada44f12 100644
--- a/warcio/cli.py
+++ b/warcio/cli.py
@@ -4,6 +4,8 @@
 from warcio.checker import Checker
 from warcio.extractor import Extractor
 from warcio.recompressor import Recompressor
+from warcio.tester import Tester
+from warcio.utils import BUFF_SIZE
 
 import sys
 
@@ -51,6 +53,10 @@ def main(args=None):
     check.add_argument('-v', '--verbose', action='store_true')
     check.set_defaults(func=checker)
 
+    test = subparsers.add_parser('test', help='WARC standards tester')
+    test.add_argument('inputs', nargs='+')
+    test.set_defaults(func=tester)
+
     cmd = parser.parse_args(args=args)
     cmd.func(cmd)
 
@@ -86,6 +92,12 @@ def recompressor(cmd):
     _recompressor.recompress()
 
 
+# ============================================================================
+def tester(cmd):
+    _tester = Tester(cmd)
+    sys.exit(_tester.process_all())
+
+
 # ============================================================================
 if __name__ == "__main__":  #pragma: no cover
     main()
diff --git a/warcio/recordloader.py b/warcio/recordloader.py
index 05b159df..2f48233b 100644
--- a/warcio/recordloader.py
+++ b/warcio/recordloader.py
@@ -58,7 +58,7 @@ class ArcWarcRecordLoader(object):
     NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:')
     HTTP_SCHEMES = ('http:', 'https:')
 
-    def __init__(self, verify_http=True, arc2warc=True):
+    def __init__(self, verify_http=True, arc2warc=True, fixup_bugs=True):
         if arc2warc:
             self.arc_parser = ARC2WARCHeadersParser()
         else:
@@ -68,6 +68,7 @@ def __init__(self, verify_http=True, arc2warc=True):
         self.http_parser = StatusAndHeadersParser(self.HTTP_TYPES, verify_http)
 
         self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http)
+        self.fixup_bugs = fixup_bugs
 
     def parse_record_stream(self, stream,
                             statusline=None,
@@ -99,7 +100,7 @@ def parse_record_stream(self, stream,
 
         elif the_format in ('warc', 'arc2warc'):
             rec_type = rec_headers.get_header('WARC-Type')
-            uri = self._ensure_target_uri_format(rec_headers)
+            uri = self._ensure_target_uri_format(rec_headers, fixup_bugs=self.fixup_bugs)
             length = rec_headers.get_header('Content-Length')
             content_type = rec_headers.get_header('Content-Type')
             if the_format == 'warc':
@@ -238,7 +239,7 @@ def _detect_type_load_headers(self, stream,
                 msg = 'Unknown archive format, first line: '
             raise ArchiveLoadFailed(msg + str(se.statusline))
 
-    def _ensure_target_uri_format(self, rec_headers):
+    def _ensure_target_uri_format(self, rec_headers, fixup_bugs=True):
         """Checks the value for the WARC-Target-URI header field to see if it starts
         with '<' and ends with '>' (Wget 1.19 bug) and if '<' and '>' are present,
         corrects and updates the field returning the corrected value for the field
@@ -251,8 +252,7 @@ def _ensure_target_uri_format(self, rec_headers):
         :rtype: str | None
         """
         uri = rec_headers.get_header('WARC-Target-URI')
-
-        if uri is not None and uri.startswith('<') and uri.endswith('>'):
+        if fixup_bugs and uri is not None and uri.startswith('<') and uri.endswith('>'):
             uri = uri[1:-1]
             rec_headers.replace_header('WARC-Target-URI', uri)
 
diff --git a/warcio/tester.py b/warcio/tester.py
new file mode 100644
index 00000000..800f797e
--- /dev/null
+++ b/warcio/tester.py
@@ -0,0 +1,638 @@
+from __future__ import print_function
+
+import re
+import ipaddress
+import sys
+import traceback
+
+from warcio.archiveiterator import WARCIterator
+
+
+class Commentary:
+    def __init__(self, record_id, rec_type):
+        self._record_id = record_id
+        self._rec_type = rec_type
+        self.errors = []
+        self.recommendations = []
+        self._comments = []
+
+    def record_id(self):
+        return self._record_id
+
+    def rec_type(self):
+        return self._rec_type
+
+    def error(self, *args):
+        self.errors.append(args)
+
+    def recommendation(self, *args):
+        self.recommendations.append(args)
+
+    def comment(self, *args):
+        self._comments.append(args)
+
+    def has_comments(self):
+        if self.errors or self.recommendations or self._comments:
+            return True
+
+    def comments(self):
+        for e in self.errors:
+            yield 'error: ' + ' '.join(e)
+        for r in self.recommendations:
+            yield 'recommendation: ' + ' '.join(r)
+        for c in self._comments:
+            yield 'comment: ' + ' '.join(c)
+
+
+class WrapRecord(object):
+    def __init__(self, obj):
+        self.obj = obj
+        self._content = None
+
+    def __getattr__(self, name):
+        if name == 'content':
+            if self._content is None:
+                self._content = self.obj.content_stream().read()
+            return self._content
+        return getattr(self.__dict__['obj'], name)
+
+
+def canon_content_type(s):
+    return s.lower().replace('; ', ';')
+
+
+def validate_warc_fields(record, commentary):
+    # warc-fields = *named-field CRLF
+    # named-field = field-name ":" [ field-value ]
+    # field-value = *( field-content | LWS )  # LWS signals continuations
+    # field-name = token  # token_re
+
+    content = record.content
+    try:
+        text = content.decode('utf-8', errors='strict')
+    except UnicodeDecodeError as e:
+        commentary.error('warc-fields contains invalid utf-8: '+str(e))
+        text = content.decode('utf-8', errors='replace')
+
+    first_line = True
+    lines = []
+    for line in text.splitlines(True):
+        if not line.endswith('\r\n'):
+            commentary.error('warc-fields lines must end with \r\n')
+            line = line.rstrip('\r\n')
+        else:
+            line = line[:-2]
+
+        if line.startswith(' ') or line.startswith('\t'):
+            if first_line:
+                commentary.error('The first line of warc-fields cannot start with whitespace')
+            else:
+                lines[-1] += ' ' + line[1:]
+        elif line == '':
+            # are blank lines prohibited?
+            pass
+        else:
+            # check for field-name :
+            if ':' not in line:
+                commentary.error('Missing field-name : in warc-fields line', line)
+            else:
+                field_name = line.split(':', 1)[0]
+                if not re.fullmatch(token_re, field_name):
+                    commentary('invalid warc-fields name', field_name)
+                else:
+                    lines.append(line)
+        first_line = False
+
+    # check known fields
+
+
+def validate_warcinfo(record, commentary, pending):
+    content_type = record.rec_headers.get_header('Content-Type')
+    if content_type.lower() != 'application/warc-fields':
+        commentary.recommencation('warcinfo Content-Type of application/warc-fields, saw', content_type)
+    else:
+        #   format: warc-fields
+        #   allowable fields include but not limited to DMCI plus the following
+        #   operator, software, robots, hostname, ip, http-header-user-agent, http-header-from
+        #     if operator present, recommended name or name and email address
+        #     comment if http-user-agent here and in the request or metadata record?
+        #     comment if http-header-from here and in the request?
+        validate_warc_fields(record, commentary)
+
+    # whole-file tests:
+    # optional that warcinfo be first in file, still deserves a comment
+    # allowable for warcinfo to appear anywhere
+
+
+def validate_response(record, commentary, pending):
+    target_uri = record.rec_headers.get_header('WARC-Target-URI').lower()
+
+    if target_uri.startswith('http:') or target_uri.startswith('https:'):
+        content_type = record.rec_headers.get_header('Content-Type')
+        if canon_content_type(content_type) not in {'application/http;msgtype=response', 'application/http'}:
+            commentary.error('responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw ', content_type)
+
+        if record.rec_headers.get_header('WARC-IP-Address') is None:
+            commentary.error('WARC-IP-Address should be used for http and https responses')
+
+        # error: http and https schemes should have http response headers
+        # comment: verify http content-length, if present -- commoncrawl nutch bug
+
+
+def validate_resource(record, commentary, pending):
+    target_uri = record.rec_headers.get_header('WARC-Target-URI').lower()
+
+    if target_uri.startswith('dns:'):
+        content_type = record.rec_headers.get_header('Content-Type')
+        if content_type.lower() != 'text/dns':
+            commentary.error('recource records for dns: shall have Content-Type of text/dns, saw', content_type)
+        else:
+            # rfc 2540 and rfc 1035
+            #validate_text_dns()
+            pass
+
+    # should never have http headers
+
+
+def validate_request(record, commentary, pending):
+    target_uri = record.rec_headers.get_header('WARC-Target-URI').lower()
+
+    if target_uri.startswith('http:') or target_uri.startswith('https:'):
+        content_type = record.rec_headers.get_header('Content-Type')
+
+        if canon_content_type(content_type) not in {'application/http;msgtype=request', 'application/http'}:
+            commentary.error('requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw ', content_type)
+
+        if record.rec_headers.get_header('WARC-IP-Address') is None:
+            commentary.error('WARC-IP-Address should be used for http and https requests')
+
+        # error: http and https schemes should have http request headers
+
+        # WARC-Concurrent-To field or fields may be used, comment if present but target record is not
+
+
+def validate_metadata(record, commentary, pending):
+    content_type = record.rec_headers.get_header('Content-Type')
+    if content_type.lower() == 'application/warc-fields':
+        # dublin core plus via, hopsFromSeed, fetchTimeMs -- w1.1 section 6
+        # via: uri -- example in Warc 1.1 section 10.5 does not have <> around it
+        # hopsFromSeed: string
+        # fetchTimeMs: time in milliseconds, so it's an integer?
+        validate_warc_fields(record, commentary)
+
+
+def validate_revisit(record, commentary, pending):
+    warc_profile = record.rec_headers.get_header('WARC-Profile')
+
+    if warc_profile.endswith('/revisit/identical-payload-digest') or warc_profile.endswith('/revisit/uri-agnostic-identical-payload-digest'):
+        config = {
+            'required': ['WARC-Payload-Digest'],
+            'recommended': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date'],
+        }
+        validate_fields_against_rec_type('revisit', config, record.rec_headers, commentary, allow_all=True)
+        # may have record block; if not, shall have Content-Length: 0, if yes, should be like a response record, truncated FOR LENGTH ONLY if desired
+        # recommended that server response headers be preserved "in this manner"
+
+    elif warc_profile.ends_with('/revisit/server-not-modified'):
+        config = {
+            'recommended': ['WARC-Refers-To', 'WARC-Refers-To-Date'],
+            'prohibited': ['WARC-Payload-Digest'],
+        }
+        validate_fields_against_rec_type('revisit', config, record.rec_headers, commentary, allow_all=True)
+        #   may have content body; if not, shall have Content-Length: 0, if yes, should be like a response record, truncated if desired
+        #   WARC-Refers-To-Date should be the same as WARC-Date in the original record if present
+    else:
+        commentary.comment('no revisit details validation done due to unknown profile')
+
+
+def validate_conversion(record, commentary, pending):
+    # where practical, have a warc-refers-to field -- not quite a recommendation, perhaps make it a comment?
+    # suggests there should be a corresponding metadata record -- which may have a WARC-Refers-To
+    pass
+
+
+def validate_continuation(record, commentary, pending):
+    commentary.comment('warcio test continuation code has not been tested, expect bugs')
+
+    warc_type = record.rec_headers.get_header('WARC-Type')
+    if warc_type in {'warcinfo', 'request', 'metadata', 'revisit'}:
+        commentary.recommendation('do not segment warc-type', warc_type)
+
+    # last segment: required WARC-Segment-Total-Length, optional WARC-Truncated
+
+
+def validate_actual_uri(field, value, record, version, commentary, pending):
+    # uri per RFC 3986
+    # should use a registered scheme
+    # %XX encoding, normalize to upper case
+    # schemes are case-insensitive and normalize to lower
+    if value.startswith('<') or value.endswith('>'):
+        # wget 1.19 bug caused by WARC 1.0 spec error
+        commentary.error('uri must not be within <>', field, value)
+    if ':' not in value:
+        commentary.error('invalid uri, no scheme', field, value)
+    if re.search(r'\s', value, re.A):
+        commentary.error('invalid uri, contains whitespace', field, value)
+    scheme, rest = value.split(':', 1)
+    if not re.fullmatch(r'[A-Za-z][A-Za-z0-9+\-\.]*', scheme, re.A):
+        commentary.error('invalid uri scheme, bad character', field, value)
+    # https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
+
+
+def validate_warc_type(field, value, record, version, commentary, pending):
+    if not value.islower():
+        # I am unclear if this is allowed? standard is silent
+        commentary.comment('Warc-Type is not lower-case', field, value)
+    if value.lower() not in record_types:
+        # standard says readers should ignore unknown warc-types
+        commentary.comment('unknown Warc-Type', field, value)
+
+
+def validate_uri(field, value, record, version, commentary, pending):
+    # < uri >
+    if not (value.startswith('<') and value.endswith('>')):
+        commentary.error('uri must be within <>', field, value)
+        return
+    validate_actual_uri(field, value[1:-1], record, version, commentary, pending)
+
+
+def validate_record_id(field, value, record, version, commentary, pending):
+    validate_uri(field, value, record, version, commentary, pending)
+    # TODO: should be "globally unique for its period of intended use"
+
+
+def validate_timestamp(field, value, record, version, commentary, pending):
+    use_ms = False if version == '1.0' else True
+    if not use_ms:
+        if '.' in value:
+            # XXX specification infelicity: would be nice to have 'advice to implementers' here
+            commentary.error('WARC 1.0 may not have fractional seconds', field, value)
+    else:
+        start, end = value.split('.', 1)
+        if not re.fullmatch(r'[0-9]{1,9}Z', end, re.A):
+            commentary.error('fractional seconds must have 1-9 digits', field, value)
+
+    # XXX the above is pretty incomplete for dash, colon, trailing Z, etc
+
+    # TODO: "multiple records written as part of a single capture event shall use the same WARC-Date"
+    # how? follow WARC-Concurrent-To pointer(s) from request to response(s)
+
+
+def validate_content_length(field, value, record, version, commentary, pending):
+    if not value.isdigit():
+        commentary.error('must be an integer', field, value)
+
+
+token_re = r'[!"#$%&\'()*+\-\.0-9A-Z\^_`a-z|~]+'
+digest_re = r'[A-Za-z0-9/+\-_=]+'
+
+
+def validate_content_type(field, value, record, version, commentary, pending):
+    if '/' not in value:
+        commentary.error('must contain a /', field, value)
+    ctype, rest = value.split('/', 1)
+    if not re.fullmatch(token_re, ctype, re.A):
+        commentary.error('invalid type', field, value)
+    if ';' in rest:
+        subtype, rest = rest.split(';', 1)
+    else:
+        subtype = rest
+    if not re.fullmatch(token_re, subtype, re.A):
+        commentary.error('invalid subtype', field, value)
+    # at this point there can be multiple parameters,
+    # some of which could have quoted string values with ; in them
+    # TODO: more checking
+
+
+def validate_digest(field, value, record, version, commentary, pending):
+    if ':' not in value:
+        commentary.error('missing algorithm', field, value)
+    algorithm, digest = value.split(':', 1)
+    if not re.fullmatch(token_re, algorithm, re.A):
+        commentary.error('invalid algorithm', field, value)
+    if not re.fullmatch(token_re, digest, re.A):
+        # https://github.com/iipc/warc-specifications/issues/48
+        # commentary.comment('spec incorrectly says this is an invalid digest', field, value)
+        pass
+    if not re.fullmatch(digest_re, digest, re.A):
+        commentary.comment('Invalid-looking digest value', field, value)
+
+
+def validate_ip(field, value, record, version, commentary, pending):
+    # ipv4 as dotted quad, or ipv6 per section 2.2 of rfc 4291
+    try:
+        ipaddress.ip_address(value)
+    except ValueError:
+        commentary.error('invalid ip', field, value)
+
+
+def validate_truncated(field, value, record, version, commentary, pending):
+    if value.lower() not in {'length', 'time', 'disconnect', 'unspecified'}:
+        commentary.comment('extension seen', field, value)
+
+
+def validate_warcinfo_id(field, value, record, version, commentary, pending):
+    validate_uri(field, value, record, version, commentary, pending)
+    # TODO: should point at a warcinfo record
+
+
+def validate_filename(field, value, record, version, commentary, pending):
+    # TODO: text or quoted-string
+    pass
+
+
+profiles = {
+    '1.0': ['http://netpreserve.org/warc/1.1/revisit/identical-payload-digest',
+            'http://netpreserve.org/warc/1.1/revisit/server-not-modified',
+            # the following removed from iipc/webarchive-commons in may 2017; common in the wild TODO comment or not?
+            'http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest'],
+    '1.1': ['http://netpreserve.org/warc/1.0/revisit/identical-payload-digest',
+            'http://netpreserve.org/warc/1.0/revisit/server-not-modified'],
+}
+
+
+def validate_profile(field, value, record, version, commentary, pending):
+    if version not in profiles:
+        commentary.comment('no profile check because unknown warc version', field, value)
+        return
+    if value not in profiles[version]:
+        commentary.comment('extension seen', field, value)
+
+
+def validate_segment_number(field, value, record, version, commentary, pending):
+    if not value.isdigit():
+        commentary.error('must be an integer', field, value)
+    iv = int(value)
+    if iv == 0:
+        commentary.error('must be 1 or greater', field, value)
+    # TODO: type != continuation must have iv == 1, else iv > 1
+    # might make that check in the 'continuation' section?
+
+
+def validate_segment_total_length(field, value, record, version, commentary, pending):
+    if not value.isdigit():
+        commentary.error('must be an integer', field, value)
+
+
+warc_fields = {
+    'WARC-Type': {
+        'validate': validate_warc_type,
+    },
+    'WARC-Record-ID': {
+        'validate': validate_record_id,
+    },
+    'WARC-Date': {
+        'validate': validate_timestamp,
+    },
+    'Content-Length': {
+        'validate': validate_content_length,
+    },
+    'Content-Type': {
+        'validate': validate_content_type,
+    },
+    'WARC-Concurrent-To': {
+        'validate': validate_uri,
+    },
+    'WARC-Block-Digest': {
+        'validate': validate_digest,  # openssl check? or just let check_digest get it?
+    },
+    'WARC-Payload-Digest': {
+        'validate': validate_digest,
+    },
+    'WARC-IP-Address': {
+        'validate': validate_ip,
+    },
+    'WARC-Refers-To': {
+        'validate': validate_uri,
+    },
+    'WARC-Target-URI': {
+        'validate': validate_actual_uri,
+    },
+    'WARC-Truncated': {
+        'validate': validate_truncated,
+    },
+    'WARC-Warcinfo-ID': {
+        'validate': validate_warcinfo_id,
+    },
+    'WARC-Filename': {
+        'validate': validate_filename,
+    },
+    'WARC-Profile': {
+        'validate': validate_profile,
+    },
+    'WARC-Identified-Payload-Type': {
+        'validate': validate_content_type,
+    },
+    'WARC-Segment-Origin-ID': {
+        'validate': validate_uri,
+    },
+    'WARC-Segment-Number': {
+        'validate': validate_segment_number,
+    },
+    'WARC-Segment-Total-Length': {
+        'validate': validate_segment_total_length,
+    },
+    'WARC-Refers-To-Target-URI': {
+        'validate': validate_actual_uri,
+        'minver': '1.1',
+    },
+    'WARC-Refers-To-Date': {
+        'validate': validate_timestamp,
+        'minver': '1.1',
+    },
+}
+warc_fields = dict([(k.lower(), v) for k, v in warc_fields.items()])
+
+record_types = {
+    'warcinfo': {
+        'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'Content-Type'],
+        'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Filename', 'WARC-Truncated'],
+        'prohibited': ['WARC-Refers-To', 'WARC-Profile', 'WARC-Identified-Payload-Type'],
+        'validate': validate_warcinfo,
+    },
+    'response': {
+        'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type',
+                     'Content-Type', 'WARC-Target-URI'],
+        'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated',
+                     'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address'],
+        'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
+        'validate': validate_response,
+    },
+    'resource': {
+        'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'WARC-Target-URI', 'Content-Type'],
+        'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated',
+                     'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-Identified-Payload-Type'],
+        'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
+    },
+    'request': {
+        'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type',
+                     'Content-Type', 'WARC-Target-URI'],
+        'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated',
+                     'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address'],
+        'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
+        'validate': validate_request,
+    },
+    'metadata': {
+        'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type',
+                     'Content-Type'],
+        'optional': ['WARC-Block-Digest', 'WARC-IP-Address', 'WARC-Truncated',
+                     'WARC-Concurrent-To', 'WARC-Refers-To', 'WARC-Target-URI', 'WARC-Warcinfo-ID'],
+        'prohibited': ['WARC-Payload-Digest', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
+        'validate': validate_metadata,
+    },
+    'revisit': {
+        'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type',
+                     'Content-Type', 'WARC-Target-URI', 'WARC-Profile'],
+        'optional': ['WARC-Block-Digest', 'WARC-Truncated', 'WARC-IP-Address', 'WARC-Warcinfo-ID',  # normal optionals
+                     'WARC-Payload-Digest', 'WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date'],  # these are for profiles
+        'prohibited': ['WARC-Filename'],
+        'validate': validate_revisit,
+    },
+    'conversion': {
+        'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'WARC-Target-URI'],
+        'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Truncated', 'WARC-Refers-To', 'WARC-Warcinfo-ID'],
+        'prohibited': ['WARC-Concurrent-To', 'WARC-IP-Address', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
+        'validate': validate_conversion,
+    },
+    'continuation': {
+        'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type',
+                     'WARC-Segment-Origin-ID', 'WARC-Segment-Number', 'WARC-Target-URI'],
+        'optional': [],
+        'prohibited': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
+        'validate': validate_continuation,
+    },
+}
+
+
+def make_header_set(config, kinds):
+    ret = set()
+    for kind in kinds:
+        ret = ret.union(set([x.lower() for x in config.get(kind, [])]))
+    return ret
+
+
+def validate_fields_against_rec_type(rec_type, config, rec_headers, commentary, allow_all=False):
+    for req in config.get('required', []):
+        if not rec_headers.get_header(req):
+            commentary.error('missing required header', req)
+    for rec in config.get('recommended', []):
+        if not rec_headers.get_header(rec):
+            commentary.recommendation('missing recommended header', rec)
+    allowed = make_header_set(config, ('required', 'optional', 'recommended'))
+    prohibited = make_header_set(config, ('prohibited',))
+
+    for field, value in rec_headers.headers:
+        fl = field.lower()
+        if fl in prohibited:
+            commentary.error('field not allowed in record_type', field, rec_type)
+        elif allow_all or fl in allowed:
+            pass
+        elif fl in warc_fields:
+            commentary.comment('no configuration seen for', field, rec_type)
+        else:
+            # an 'unknown field' comment has already been issued in validate_record
+            pass
+
+
+def validate_record_against_rec_type(config, record, commentary, pending):
+    if 'validate' in config:
+        config['validate'](record, commentary, pending)
+
+
+def validate_record(record):
+    version = record.rec_headers.protocol.split('/', 1)[1]  # XXX not exported?
+
+    record_id = record.rec_headers.get_header('WARC-Record-ID')
+    rec_type = record.rec_headers.get_header('WARC-Type')
+    if record_id is None:
+        print('no WARC-Record-ID seen, skipping validation', file=sys.stderr)
+        return
+    commentary = Commentary(record_id, rec_type)
+    pending = None
+
+    seen_fields = set()
+    for field, value in record.rec_headers.headers:
+        field_case = field
+        field = field.lower()
+        if field != 'warc-concurrent-to' and field in seen_fields:
+            commentary.error('duplicate field seen', field, value)
+        if field not in warc_fields:
+            commentary.comment('unknown field, no validation performed', field_case, value)
+            continue
+        config = warc_fields[field]
+        if 'minver' in config:
+            if version < config['minver']:
+                # unknown fields are extensions, so this is a comment and not an error
+                commentary.comment('field was introduced after this warc version', field_case, value, version)
+        if 'validate' in config:
+            config['validate'](field, value, record, version, commentary, pending)
+
+    # TODO: validate warc types: unknown should get a comment
+    if rec_type not in record_types:
+        commentary.comment('unknown record type, no validation performed', rec_type)
+    else:
+        validate_fields_against_rec_type(rec_type, record_types[rec_type], record.rec_headers, commentary)
+        validate_record_against_rec_type(record_types[rec_type], record, commentary, pending)
+
+    return commentary
+
+
+def _process_one(warc):
+    if warc.endswith('.arc') or warc.endswith('.arc.gz'):
+        return
+    with open(warc, 'rb') as stream:
+        for record in WARCIterator(stream, check_digests=True, fixup_bugs=False):
+
+            try:
+                record = WrapRecord(record)
+                digest_present = (record.rec_headers.get_header('WARC-Payload-Digest') or
+                                  record.rec_headers.get_header('WARC-Block-Digest'))
+
+                commentary = validate_record(record)
+
+                record.content  # make sure digests are checked
+                # XXX might need to read and digest the raw stream to check digests for chunked encoding?
+                # XXX chunked lacks Content-Length and presumably the digest needs to be computed on the non-chunked bytes
+            except Exception:
+                # because of the top-level try: to catch exceptions in WARCIterator, this is needed to debug our code
+                print('Caught exception in warcio test analysis code')
+                traceback.print_exc()
+                exit(1)
+
+            if commentary.has_comments() or record.digest_checker.passed is False:
+                print(' ', 'WARC-Record-ID', commentary.record_id())
+                print('   ', 'WARC-Type', commentary.rec_type())
+
+                if record.digest_checker.passed is True:
+                    print('    digest pass')
+                elif record.digest_checker.passed is None:
+                    if digest_present:
+                        print('    digest present but not checked')
+                    else:
+                        print('    digest not present')
+                for p in record.digest_checker.problems:
+                    print('   ', p)
+
+                if commentary.has_comments():
+                    for c in commentary.comments():
+                        print('   ', c)
+
+
+class Tester(object):
+    def __init__(self, cmd):
+        self.inputs = cmd.inputs
+        self.verbose = cmd.verbose
+        self.exit_value = 0
+
+    def process_all(self):
+        for warc in self.inputs:
+            print(warc)
+            try:
+                self.process_one(warc)
+            except Exception as e:
+                print('  saw exception '+str(e).rstrip(), file=sys.stderr)
+                print('  skipping rest of file', file=sys.stderr)
+        return self.exit_value
+
+    def process_one(self, filename):
+        _process_one(filename)

From 88dff09ee436b1922740d0600ce5e4d50693be4e Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Fri, 25 Jan 2019 16:12:21 -0800
Subject: [PATCH 36/68] documentation

---
 README.rst | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/README.rst b/README.rst
index 9bc16420..ec16452d 100644
--- a/README.rst
+++ b/README.rst
@@ -368,6 +368,14 @@ of WARC records, if possible. An exit value of 1 indicates a failure.
 ``warcio check -v`` will print verbose output for each record in the
 WARC file.
 
+Test
+~~~~
+
+The ``warcio test`` command will check one or more WARC files against
+the WARC standard, giving commentary about standards violations,
+recommendations, and other issues.
+
+
 Recompress
 ~~~~~~~~~~
 

From c99bc2e409c46826d33a9b9111cd6a290ef78bb3 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Fri, 25 Jan 2019 16:42:58 -0800
Subject: [PATCH 37/68] tests

---
 test/test_archiveiterator.py | 2 +-
 test/test_cli.py             | 2 +-
 warcio/tester.py             | 5 +++--
 warcio/utils.py              | 6 +++---
 4 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/test/test_archiveiterator.py b/test/test_archiveiterator.py
index 066b53fb..7378c7af 100644
--- a/test/test_archiveiterator.py
+++ b/test/test_archiveiterator.py
@@ -284,7 +284,7 @@ def test_corrects_wget_bug(self):
         with self._find_first_by_type('example-wget-bad-target-uri.warc.gz', 'response') as record:
             assert record.rec_headers.get('WARC-Target-URI') == 'http://example.com/'
         with self._find_first_by_type('example-wget-bad-target-uri.warc.gz', 'response', fixup_bugs=False) as record:
-            assert record.rec_headers.get('WARC-Target-URI') == 'http://example.com/'
+            assert record.rec_headers.get('WARC-Target-URI') == '<http://example.com/>'
 
     def test_corrects_space_in_target_uri(self):
         with self._find_first_by_type('example-space-in-target-uri.warc.gz', 'resource') as record:
diff --git a/test/test_cli.py b/test/test_cli.py
index 7bdc87f7..be82dab8 100644
--- a/test/test_cli.py
+++ b/test/test_cli.py
@@ -90,7 +90,7 @@ def test_check_valid():
 
 
 def test_check_invalid():
-    filenames = [get_test_file('example-digest.warc')]
+    filenames = [get_test_file('example-digest-bad.warc')]
 
     args = ['check'] + filenames
     value = check_helper(args, 1)
diff --git a/warcio/tester.py b/warcio/tester.py
index 800f797e..de456dc8 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -6,6 +6,7 @@
 import traceback
 
 from warcio.archiveiterator import WARCIterator
+from warcio.utils import to_native_str
 
 
 class Commentary:
@@ -69,10 +70,10 @@ def validate_warc_fields(record, commentary):
 
     content = record.content
     try:
-        text = content.decode('utf-8', errors='strict')
+        text = to_native_str(content, 'utf-8', errors='strict')
     except UnicodeDecodeError as e:
         commentary.error('warc-fields contains invalid utf-8: '+str(e))
-        text = content.decode('utf-8', errors='replace')
+        text = to_native_str(content, 'utf-8', errors='replace')
 
     first_line = True
     lines = []
diff --git a/warcio/utils.py b/warcio/utils.py
index 08783f06..fb544cff 100644
--- a/warcio/utils.py
+++ b/warcio/utils.py
@@ -13,14 +13,14 @@
 
 
 # #===========================================================================
-def to_native_str(value, encoding='utf-8'):
+def to_native_str(value, encoding='utf-8', errors='strict'):
     if isinstance(value, str):
         return value
 
     if six.PY3 and isinstance(value, six.binary_type):  #pragma: no cover
-        return value.decode(encoding)
+        return value.decode(encoding, errors)
     elif six.PY2 and isinstance(value, six.text_type):  #pragma: no cover
-        return value.encode(encoding)
+        return value.encode(encoding, errors)
     else:
         return value
 

From 003933534b54055aae8dcc977ebfabb70c2a5e0a Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Fri, 25 Jan 2019 17:03:04 -0800
Subject: [PATCH 38/68] tests

---
 warcio/tester.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/warcio/tester.py b/warcio/tester.py
index de456dc8..386586bb 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -1,7 +1,6 @@
 from __future__ import print_function
 
 import re
-import ipaddress
 import sys
 import traceback
 
@@ -9,6 +8,14 @@
 from warcio.utils import to_native_str
 
 
+def try_ipaddress_init():
+    # ipaddress is in 3.3+ but not 2.7. It is in pypi but we wish to limit dependencies.
+    try:
+        import ipaddress
+    except ImportError:  # pragma: no cover
+        pass
+
+
 class Commentary:
     def __init__(self, record_id, rec_type):
         self._record_id = record_id
@@ -325,6 +332,8 @@ def validate_ip(field, value, record, version, commentary, pending):
         ipaddress.ip_address(value)
     except ValueError:
         commentary.error('invalid ip', field, value)
+    except NameError:
+        commentary.comment('did not check ip address format, install ipaddress module from pypi if you care')
 
 
 def validate_truncated(field, value, record, version, commentary, pending):
@@ -622,8 +631,8 @@ def _process_one(warc):
 class Tester(object):
     def __init__(self, cmd):
         self.inputs = cmd.inputs
-        self.verbose = cmd.verbose
         self.exit_value = 0
+        try_ipaddress_init()
 
     def process_all(self):
         for warc in self.inputs:

From 9b7c9ce8af01d6e66fbe17b36e4bec043f746b3e Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Sat, 26 Jan 2019 08:39:22 -0800
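Subject: [PATCH 39/68] coverage

Add two torture-test fixtures (standard-torture-missing.warc and
standard-torture-validate-record.warc) and test/test_tests.py, which
runs the "test" subcommand on them and compares the full output.

Changes to warcio/tester.py made alongside the new tests:

* pass a default ('none' or '') to get_header() before calling string
  methods on Content-Type, WARC-Target-URI, WARC-Profile and
  WARC-Segment-Number
* fix misspelled calls: commentary.recommencation -> recommendation,
  str.ends_with -> str.endswith, commentary(...) -> commentary.comment(...)
* report warc-fields formatting problems as comments rather than errors,
  and include the offending line
* emit a comment when a warc-fields body yields no usable lines
* validate_continuation now checks that WARC-Segment-Number is > 1;
  the per-type segment checks move into validate_segment_number, which
  also returns early on a non-integer value
* allow WARC-Segment-Total-Length and WARC-Truncated on continuation
  records
* iterate required and recommended headers in sorted order
* validate records that have no WARC-Record-ID instead of skipping them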
---
 test/data/standard-torture-missing.warc       |   5 +
 .../standard-torture-validate-record.warc     |  79 ++++++++++
 test/test_tests.py                            | 149 ++++++++++++++++++
 warcio/tester.py                              |  79 ++++++----
 4 files changed, 278 insertions(+), 34 deletions(-)
 create mode 100644 test/data/standard-torture-missing.warc
 create mode 100644 test/data/standard-torture-validate-record.warc
 create mode 100644 test/test_tests.py

diff --git a/test/data/standard-torture-missing.warc b/test/data/standard-torture-missing.warc
new file mode 100644
index 00000000..a1ab0714
--- /dev/null
+++ b/test/data/standard-torture-missing.warc
@@ -0,0 +1,5 @@
+WARC/1.0
+WARC-Type: warcinfo
+Content-Length: 0
+
+
diff --git a/test/data/standard-torture-validate-record.warc b/test/data/standard-torture-validate-record.warc
new file mode 100644
index 00000000..5181ea38
--- /dev/null
+++ b/test/data/standard-torture-validate-record.warc
@@ -0,0 +1,79 @@
+WARC/1.0
+WARC-Type: warcinfo
+Content-Type: application/warc-fields
+Content-Length: 146
+
+ first line can't start with a space
+test: invalid utf8 �(
+test: lines should end with \r\n
+foo:
+ bar
+
+no colon
+token cannot have a space:
+
+
+WARC/1.0
+WARC-Type: warcinfo
+Content-Type: application/warc-fields
+Content-Length: 0
+
+
+WARC/1.0
+WARC-Type: response
+WARC-Target-URI: HtTp://example.com/
+Content-Type: text/plain
+Content-Length: 0
+
+
+WARC/1.0
+WARC-Type: resource
+WARC-Target-URI: DnS:asdfasdf
+Content-Type: text/plain
+Content-Length: 0
+
+
+WARC/1.0
+WARC-Type: resource
+WARC-Target-URI: DnS:asdfasdf
+Content-Type: text/dns
+Content-Length: 0
+
+
+WARC/1.0
+WARC-Type: request
+WARC-Target-URI: hTtP://example.com/
+Content-Type: text/plain
+Content-Length: 0
+
+
+WARC/1.0
+WARC-Type: metadata
+Content-Type: application/warc-fields
+Content-Length: 0
+
+
+WARC/1.0
+WARC-Type: revisit
+WARC-Profile: none
+Content-Length: 0
+
+
+WARC/1.0
+WARC-Type: revisit
+WARC-Profile: http://netpreserve.org/warc/1.1/revisit/identical-payload-digest
+Content-Length: 0
+
+
+WARC/1.0
+WARC-Type: revisit
+WARC-Profile: http://netpreserve.org/warc/1.0/revisit/server-not-modified
+Content-Length: 0
+
+
+WARC/1.0
+WARC-Type: continuation
+WARC-Segment-Number: 1
+Content-Length: 0
+
+
diff --git a/test/test_tests.py b/test/test_tests.py
new file mode 100644
index 00000000..239d2461
--- /dev/null
+++ b/test/test_tests.py
@@ -0,0 +1,149 @@
+from warcio.cli import main
+
+from . import get_test_file
+from .test_cli import patch_stdout
+
+
+def helper(args, expected_exit_value):
+    with patch_stdout() as buff:
+        exit_value = None
+        try:
+            main(args=args)
+        except SystemExit as e:
+            exit_value = e.code
+        finally:
+            assert exit_value == expected_exit_value
+
+        return buff.getvalue()
+
+
+def remove_before_test_data(s):
+    ret = b''
+    for line in s.splitlines(True):
+        if b'/test/data/' in line:
+            line = b'test/data/' + line.split(b'/test/data/', 1)[1]
+        ret += line
+    return ret
+
+
+def test_torture_missing():
+    files = ['standard-torture-missing.warc']
+    files = [get_test_file(filename) for filename in files]
+
+    args = ['test']
+    args.extend(files)
+
+    expected = b"""\
+test/data/standard-torture-missing.warc
+  WARC-Record-ID None
+    WARC-Type warcinfo
+    digest not present
+    error: missing required header Content-Type
+    error: missing required header WARC-Date
+    error: missing required header WARC-Record-ID
+    recommendation: warcinfo Content-Type of application/warc-fields, saw none
+"""
+
+    value = helper(args, 0)
+    assert remove_before_test_data(value) == expected
+
+
+def test_torture_validate_record():
+    files = ['standard-torture-validate-record.warc']
+    files = [get_test_file(filename) for filename in files]
+
+    args = ['test']
+    args.extend(files)
+
+    expected = b"""\
+test/data/standard-torture-validate-record.warc
+  WARC-Record-ID None
+    WARC-Type warcinfo
+    digest not present
+    error: missing required header WARC-Date
+    error: missing required header WARC-Record-ID
+    error: warc-fields contains invalid utf-8: 'utf-8' codec can't decode byte 0xc3 in position 57: invalid continuation byte
+    comment: The first line of warc-fields cannot start with whitespace
+    comment: warc-fields lines must end with \\r\\n: test: lines should end with \\r\\n
+    comment: Missing field-name : in warc-fields line: no colon
+    comment: invalid warc-fields name: token cannot have a space
+  WARC-Record-ID None
+    WARC-Type warcinfo
+    digest not present
+    error: missing required header WARC-Date
+    error: missing required header WARC-Record-ID
+    comment: warc-fields body present but empty
+  WARC-Record-ID None
+    WARC-Type response
+    digest not present
+    error: missing required header WARC-Date
+    error: missing required header WARC-Record-ID
+    error: responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw text/plain
+    error: WARC-IP-Address should be used for http and https responses
+  WARC-Record-ID None
+    WARC-Type resource
+    digest not present
+    error: missing required header WARC-Date
+    error: missing required header WARC-Record-ID
+  WARC-Record-ID None
+    WARC-Type resource
+    digest not present
+    error: missing required header WARC-Date
+    error: missing required header WARC-Record-ID
+  WARC-Record-ID None
+    WARC-Type request
+    digest not present
+    error: missing required header WARC-Date
+    error: missing required header WARC-Record-ID
+    error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw text/plain
+    error: WARC-IP-Address should be used for http and https requests
+  WARC-Record-ID None
+    WARC-Type metadata
+    digest not present
+    error: missing required header WARC-Date
+    error: missing required header WARC-Record-ID
+    comment: warc-fields body present but empty
+  WARC-Record-ID None
+    WARC-Type revisit
+    digest not present
+    error: missing required header Content-Type
+    error: missing required header WARC-Date
+    error: missing required header WARC-Record-ID
+    error: missing required header WARC-Target-URI
+    comment: extension seen warc-profile none
+    comment: no revisit details validation done due to unknown profile
+  WARC-Record-ID None
+    WARC-Type revisit
+    digest not present
+    error: missing required header Content-Type
+    error: missing required header WARC-Date
+    error: missing required header WARC-Record-ID
+    error: missing required header WARC-Target-URI
+    error: missing required header WARC-Payload-Digest
+    recommendation: missing recommended header WARC-Refers-To
+    recommendation: missing recommended header WARC-Refers-To-Date
+    recommendation: missing recommended header WARC-Refers-To-Target-URI
+  WARC-Record-ID None
+    WARC-Type revisit
+    digest not present
+    error: missing required header Content-Type
+    error: missing required header WARC-Date
+    error: missing required header WARC-Record-ID
+    error: missing required header WARC-Target-URI
+    recommendation: missing recommended header WARC-Refers-To
+    recommendation: missing recommended header WARC-Refers-To-Date
+    comment: extension seen warc-profile http://netpreserve.org/warc/1.0/revisit/server-not-modified
+  WARC-Record-ID None
+    WARC-Type continuation
+    digest not present
+    error: missing required header WARC-Date
+    error: missing required header WARC-Record-ID
+    error: missing required header WARC-Segment-Origin-ID
+    error: missing required header WARC-Target-URI
+    error: continuation record must have WARC-Segment-Number > 1, saw 1
+    comment: warcio test continuation code has not been tested, expect bugs
+"""
+
+    value = helper(args, 0)
+    print(remove_before_test_data(value).decode())
+    assert remove_before_test_data(value) == expected
diff --git a/warcio/tester.py b/warcio/tester.py
index 386586bb..bdfe38f0 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -75,10 +75,10 @@ def validate_warc_fields(record, commentary):
     # field-value = *( field-content | LWS )  # LWS signals continuations
     # field-name = token  # token_re
 
-    content = record.content
+    content = record.content  # TESTME
     try:
         text = to_native_str(content, 'utf-8', errors='strict')
-    except UnicodeDecodeError as e:
+    except UnicodeDecodeError as e:  # TESTME
         commentary.error('warc-fields contains invalid utf-8: '+str(e))
         text = to_native_str(content, 'utf-8', errors='replace')
 
@@ -86,14 +86,14 @@ def validate_warc_fields(record, commentary):
     lines = []
     for line in text.splitlines(True):
         if not line.endswith('\r\n'):
-            commentary.error('warc-fields lines must end with \r\n')
+            commentary.comment('warc-fields lines must end with \\r\\n:', line.rstrip())
             line = line.rstrip('\r\n')
         else:
             line = line[:-2]
 
         if line.startswith(' ') or line.startswith('\t'):
             if first_line:
-                commentary.error('The first line of warc-fields cannot start with whitespace')
+                commentary.comment('The first line of warc-fields cannot start with whitespace')
             else:
                 lines[-1] += ' ' + line[1:]
         elif line == '':
@@ -102,22 +102,26 @@ def validate_warc_fields(record, commentary):
         else:
             # check for field-name :
             if ':' not in line:
-                commentary.error('Missing field-name : in warc-fields line', line)
+                commentary.comment('Missing field-name : in warc-fields line:', line)
             else:
                 field_name = line.split(':', 1)[0]
                 if not re.fullmatch(token_re, field_name):
-                    commentary('invalid warc-fields name', field_name)
+                    commentary.comment('invalid warc-fields name:', field_name)
                 else:
                     lines.append(line)
         first_line = False
 
+    if not lines:
+        commentary.comment('warc-fields body present but empty')
+        return
+
     # check known fields
 
 
 def validate_warcinfo(record, commentary, pending):
-    content_type = record.rec_headers.get_header('Content-Type')
+    content_type = record.rec_headers.get_header('Content-Type', 'none')
     if content_type.lower() != 'application/warc-fields':
-        commentary.recommencation('warcinfo Content-Type of application/warc-fields, saw', content_type)
+        commentary.recommendation('warcinfo Content-Type of application/warc-fields, saw', content_type)
     else:
         #   format: warc-fields
         #   allowable fields include but not limited to DMCI plus the following
@@ -133,25 +137,27 @@ def validate_warcinfo(record, commentary, pending):
 
 
 def validate_response(record, commentary, pending):
-    target_uri = record.rec_headers.get_header('WARC-Target-URI').lower()
+    target_uri = record.rec_headers.get_header('WARC-Target-URI', 'none').lower()  # TESTME
 
     if target_uri.startswith('http:') or target_uri.startswith('https:'):
-        content_type = record.rec_headers.get_header('Content-Type')
+        content_type = record.rec_headers.get_header('Content-Type', 'none')
         if canon_content_type(content_type) not in {'application/http;msgtype=response', 'application/http'}:
-            commentary.error('responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw ', content_type)
+            commentary.error('responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw', content_type)
 
         if record.rec_headers.get_header('WARC-IP-Address') is None:
             commentary.error('WARC-IP-Address should be used for http and https responses')
 
         # error: http and https schemes should have http response headers
+        #   test by attempting to parse them?
+
         # comment: verify http content-length, if present -- commoncrawl nutch bug
 
 
 def validate_resource(record, commentary, pending):
-    target_uri = record.rec_headers.get_header('WARC-Target-URI').lower()
+    target_uri = record.rec_headers.get_header('WARC-Target-URI', '').lower()  # TESTME
 
     if target_uri.startswith('dns:'):
-        content_type = record.rec_headers.get_header('Content-Type')
+        content_type = record.rec_headers.get_header('Content-Type', 'none')
         if content_type.lower() != 'text/dns':
             commentary.error('recource records for dns: shall have Content-Type of text/dns, saw', content_type)
         else:
@@ -163,13 +169,13 @@ def validate_resource(record, commentary, pending):
 
 
 def validate_request(record, commentary, pending):
-    target_uri = record.rec_headers.get_header('WARC-Target-URI').lower()
+    target_uri = record.rec_headers.get_header('WARC-Target-URI', 'none').lower()  # TESTME
 
     if target_uri.startswith('http:') or target_uri.startswith('https:'):
         content_type = record.rec_headers.get_header('Content-Type')
 
         if canon_content_type(content_type) not in {'application/http;msgtype=request', 'application/http'}:
-            commentary.error('requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw ', content_type)
+            commentary.error('requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw', content_type)
 
         if record.rec_headers.get_header('WARC-IP-Address') is None:
             commentary.error('WARC-IP-Address should be used for http and https requests')
@@ -180,7 +186,7 @@ def validate_request(record, commentary, pending):
 
 
 def validate_metadata(record, commentary, pending):
-    content_type = record.rec_headers.get_header('Content-Type')
+    content_type = record.rec_headers.get_header('Content-Type', 'none')  # TESTME
     if content_type.lower() == 'application/warc-fields':
         # dublin core plus via, hopsFromSeed, fetchTimeMs -- w1.1 section 6
         # via: uri -- example in Warc 1.1 section 10.5 does not have <> around it
@@ -190,7 +196,7 @@ def validate_metadata(record, commentary, pending):
 
 
 def validate_revisit(record, commentary, pending):
-    warc_profile = record.rec_headers.get_header('WARC-Profile')
+    warc_profile = record.rec_headers.get_header('WARC-Profile', 'none')  # TESTME
 
     if warc_profile.endswith('/revisit/identical-payload-digest') or warc_profile.endswith('/revisit/uri-agnostic-identical-payload-digest'):
         config = {
@@ -201,7 +207,7 @@ def validate_revisit(record, commentary, pending):
         # may have record block; if not, shall have Content-Length: 0, if yes, should be like a response record, truncated FOR LENGTH ONLY if desired
         # recommended that server response headers be preserved "in this manner"
 
-    elif warc_profile.ends_with('/revisit/server-not-modified'):
+    elif warc_profile.endswith('/revisit/server-not-modified'):
         config = {
             'recommended': ['WARC-Refers-To', 'WARC-Refers-To-Date'],
             'prohibited': ['WARC-Payload-Digest'],
@@ -216,15 +222,15 @@ def validate_revisit(record, commentary, pending):
 def validate_conversion(record, commentary, pending):
     # where practical, have a warc-refers-to field -- not quite a recommendation, perhaps make it a comment?
     # suggests there should be a corresponding metadata record -- which may have a WARC-Refers-To
-    pass
+    pass  # TESTME
 
 
 def validate_continuation(record, commentary, pending):
-    commentary.comment('warcio test continuation code has not been tested, expect bugs')
+    commentary.comment('warcio test continuation code has not been tested, expect bugs')  # TESTME
 
-    warc_type = record.rec_headers.get_header('WARC-Type')
-    if warc_type in {'warcinfo', 'request', 'metadata', 'revisit'}:
-        commentary.recommendation('do not segment warc-type', warc_type)
+    segment_number = record.rec_headers.get_header('WARC-Segment-Number', 'none')
+    if segment_number.isdigit() and int(segment_number) < 2:
+        commentary.error('continuation record must have WARC-Segment-Number > 1, saw', segment_number)
 
     # last segment: required WARC-Segment-Total-Length, optional WARC-Truncated
 
@@ -234,7 +240,7 @@ def validate_actual_uri(field, value, record, version, commentary, pending):
     # should use a registered scheme
     # %XX encoding, normalize to upper case
     # schemes are case-insensitive and normalize to lower
-    if value.startswith('<') or value.endswith('>'):
+    if value.startswith('<') or value.endswith('>'):  # TESTME
         # wget 1.19 bug caused by WARC 1.0 spec error
         commentary.error('uri must not be within <>', field, value)
     if ':' not in value:
@@ -250,10 +256,10 @@ def validate_actual_uri(field, value, record, version, commentary, pending):
 def validate_warc_type(field, value, record, version, commentary, pending):
     if not value.islower():
         # I am unclear if this is allowed? standard is silent
-        commentary.comment('Warc-Type is not lower-case', field, value)
+        commentary.comment('WARC-Type is not lower-case', field, value)
     if value.lower() not in record_types:
         # standard says readers should ignore unknown warc-types
-        commentary.comment('unknown Warc-Type', field, value)
+        commentary.comment('unknown WARC-Type', field, value)
 
 
 def validate_uri(field, value, record, version, commentary, pending):
@@ -307,8 +313,10 @@ def validate_content_type(field, value, record, version, commentary, pending):
         subtype = rest
     if not re.fullmatch(token_re, subtype, re.A):
         commentary.error('invalid subtype', field, value)
+
     # at this point there can be multiple parameters,
     # some of which could have quoted string values with ; in them
+
     # TODO: more checking
 
 
@@ -372,11 +380,17 @@ def validate_profile(field, value, record, version, commentary, pending):
 def validate_segment_number(field, value, record, version, commentary, pending):
     if not value.isdigit():
         commentary.error('must be an integer', field, value)
+        return
     iv = int(value)
     if iv == 0:
         commentary.error('must be 1 or greater', field, value)
-    # TODO: type != continuation must have iv == 1, else iv > 1
-    # might make that check in the 'continuation' section?
+
+    rec_type = record.rec_headers.get_header('WARC-Type', 'none')
+    if rec_type != 'continuation':
+        if iv != 1:
+            commentary.error('non-continuation records must always have WARC-Segment-Number = 1', field, value)
+    elif rec_type in {'warcinfo', 'request', 'metadata', 'revisit'}:
+        commentary.recommendation('do not segment warc-type', warc_type)
 
 
 def validate_segment_total_length(field, value, record, version, commentary, pending):
@@ -507,7 +521,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen
     'continuation': {
         'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type',
                      'WARC-Segment-Origin-ID', 'WARC-Segment-Number', 'WARC-Target-URI'],
-        'optional': [],
+        'optional': ['WARC-Segment-Total-Length', 'WARC-Truncated'],
         'prohibited': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
         'validate': validate_continuation,
     },
@@ -522,10 +536,10 @@ def make_header_set(config, kinds):
 
 
 def validate_fields_against_rec_type(rec_type, config, rec_headers, commentary, allow_all=False):
-    for req in config.get('required', []):
+    for req in sorted(config.get('required', [])):
         if not rec_headers.get_header(req):
             commentary.error('missing required header', req)
-    for rec in config.get('recommended', []):
+    for rec in sorted(config.get('recommended', [])):
         if not rec_headers.get_header(rec):
             commentary.recommendation('missing recommended header', rec)
     allowed = make_header_set(config, ('required', 'optional', 'recommended'))
@@ -554,9 +568,6 @@ def validate_record(record):
 
     record_id = record.rec_headers.get_header('WARC-Record-ID')
     rec_type = record.rec_headers.get_header('WARC-Type')
-    if record_id is None:
-        print('no WARC-Record-ID seen, skipping validation', file=sys.stderr)
-        return
     commentary = Commentary(record_id, rec_type)
     pending = None
 

From 903ed1d9f0da0458dcfa9e2e055de7846b4bf13d Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Sat, 26 Jan 2019 08:46:10 -0800
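Subject: [PATCH 40/68] python 2.7 test fix

re.fullmatch() was added in Python 3.4 and does not exist on 2.7.
Replace every re.fullmatch() call in warcio/tester.py with re.search(),
and anchor token_re, digest_re and the inline patterns with \A ... \Z so
they still have to match the whole string.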
---
 warcio/tester.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/warcio/tester.py b/warcio/tester.py
index bdfe38f0..b74a3b03 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -105,7 +105,7 @@ def validate_warc_fields(record, commentary):
                 commentary.comment('Missing field-name : in warc-fields line:', line)
             else:
                 field_name = line.split(':', 1)[0]
-                if not re.fullmatch(token_re, field_name):
+                if not re.search(token_re, field_name):
                     commentary.comment('invalid warc-fields name:', field_name)
                 else:
                     lines.append(line)
@@ -248,7 +248,7 @@ def validate_actual_uri(field, value, record, version, commentary, pending):
     if re.search(r'\s', value, re.A):
         commentary.error('invalid uri, contains whitespace', field, value)
     scheme, rest = value.split(':', 1)
-    if not re.fullmatch(r'[A-Za-z][A-Za-z0-9+\-\.]*', scheme, re.A):
+    if not re.search(r'\A[A-Za-z][A-Za-z0-9+\-\.]*\Z', scheme, re.A):
         commentary.error('invalid uri scheme, bad character', field, value)
     # https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
 
@@ -283,7 +283,7 @@ def validate_timestamp(field, value, record, version, commentary, pending):
             commentary.error('WARC 1.0 may not have fractional seconds', field, value)
     else:
         start, end = value.split('.', 1)
-        if not re.fullmatch(r'[0-9]{1,9}Z', end, re.A):
+        if not re.search(r'\A[0-9]{1,9}Z\Z', end, re.A):
             commentary.error('fractional seconds must have 1-9 digits', field, value)
 
     # XXX the above is pretty incomplete for dash, colon, trailing Z, etc
@@ -297,21 +297,21 @@ def validate_content_length(field, value, record, version, commentary, pending):
         commentary.error('must be an integer', field, value)
 
 
-token_re = r'[!"#$%&\'()*+\-\.0-9A-Z\^_`a-z|~]+'
-digest_re = r'[A-Za-z0-9/+\-_=]+'
+token_re = r'\A[!"#$%&\'()*+\-\.0-9A-Z\^_`a-z|~]+\Z'
+digest_re = r'\A[A-Za-z0-9/+\-_=]+\Z'
 
 
 def validate_content_type(field, value, record, version, commentary, pending):
     if '/' not in value:
         commentary.error('must contain a /', field, value)
     ctype, rest = value.split('/', 1)
-    if not re.fullmatch(token_re, ctype, re.A):
+    if not re.search(token_re, ctype, re.A):
         commentary.error('invalid type', field, value)
     if ';' in rest:
         subtype, rest = rest.split(';', 1)
     else:
         subtype = rest
-    if not re.fullmatch(token_re, subtype, re.A):
+    if not re.search(token_re, subtype, re.A):
         commentary.error('invalid subtype', field, value)
 
     # at this point there can be multiple parameters,
@@ -324,13 +324,13 @@ def validate_digest(field, value, record, version, commentary, pending):
     if ':' not in value:
         commentary.error('missing algorithm', field, value)
     algorithm, digest = value.split(':', 1)
-    if not re.fullmatch(token_re, algorithm, re.A):
+    if not re.search(token_re, algorithm, re.A):
         commentary.error('invalid algorithm', field, value)
-    if not re.fullmatch(token_re, digest, re.A):
+    if not re.search(token_re, digest, re.A):
         # https://github.com/iipc/warc-specifications/issues/48
         # commentary.comment('spec incorrectly says this is an invalid digest', field, value)
         pass
-    if not re.fullmatch(digest_re, digest, re.A):
+    if not re.search(digest_re, digest, re.A):
         commentary.comment('Invalid-looking digest value', field, value)
 
 

From 68938bdce2de2180a204a600511e0a5242c5142a Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Sat, 26 Jan 2019 08:51:08 -0800
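Subject: [PATCH 41/68] python 2.7 fixes

The re.A (re.ASCII) flag exists only on Python 3. Drop it from the
re.search() calls in warcio/tester.py so the module also runs on 2.7.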
---
 warcio/tester.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/warcio/tester.py b/warcio/tester.py
index b74a3b03..c978a404 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -245,10 +245,10 @@ def validate_actual_uri(field, value, record, version, commentary, pending):
         commentary.error('uri must not be within <>', field, value)
     if ':' not in value:
         commentary.error('invalid uri, no scheme', field, value)
-    if re.search(r'\s', value, re.A):
+    if re.search(r'\s', value):
         commentary.error('invalid uri, contains whitespace', field, value)
     scheme, rest = value.split(':', 1)
-    if not re.search(r'\A[A-Za-z][A-Za-z0-9+\-\.]*\Z', scheme, re.A):
+    if not re.search(r'\A[A-Za-z][A-Za-z0-9+\-\.]*\Z', scheme):
         commentary.error('invalid uri scheme, bad character', field, value)
     # https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
 
@@ -283,7 +283,7 @@ def validate_timestamp(field, value, record, version, commentary, pending):
             commentary.error('WARC 1.0 may not have fractional seconds', field, value)
     else:
         start, end = value.split('.', 1)
-        if not re.search(r'\A[0-9]{1,9}Z\Z', end, re.A):
+        if not re.search(r'\A[0-9]{1,9}Z\Z', end):
             commentary.error('fractional seconds must have 1-9 digits', field, value)
 
     # XXX the above is pretty incomplete for dash, colon, trailing Z, etc
@@ -305,13 +305,13 @@ def validate_content_type(field, value, record, version, commentary, pending):
     if '/' not in value:
         commentary.error('must contain a /', field, value)
     ctype, rest = value.split('/', 1)
-    if not re.search(token_re, ctype, re.A):
+    if not re.search(token_re, ctype):
         commentary.error('invalid type', field, value)
     if ';' in rest:
         subtype, rest = rest.split(';', 1)
     else:
         subtype = rest
-    if not re.search(token_re, subtype, re.A):
+    if not re.search(token_re, subtype):
         commentary.error('invalid subtype', field, value)
 
     # at this point there can be multiple parameters,
@@ -324,13 +324,13 @@ def validate_digest(field, value, record, version, commentary, pending):
     if ':' not in value:
         commentary.error('missing algorithm', field, value)
     algorithm, digest = value.split(':', 1)
-    if not re.search(token_re, algorithm, re.A):
+    if not re.search(token_re, algorithm):
         commentary.error('invalid algorithm', field, value)
-    if not re.search(token_re, digest, re.A):
+    if not re.search(token_re, digest):
         # https://github.com/iipc/warc-specifications/issues/48
         # commentary.comment('spec incorrectly says this is an invalid digest', field, value)
         pass
-    if not re.search(digest_re, digest, re.A):
+    if not re.search(digest_re, digest):
         commentary.comment('Invalid-looking digest value', field, value)
 
 

From 234468a5f36176b134c71d47437865e7341e49d2 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Sat, 26 Jan 2019 11:08:05 -0800
Subject: [PATCH 42/68] coverage

---
 .../data/standard-torture-validate-field.warc |  52 ++++++++
 .../standard-torture-validate-record.warc     |   5 +
 test/test_tests.py                            | 123 +++++++++++++++++-
 warcio/tester.py                              |  73 +++++++----
 4 files changed, 219 insertions(+), 34 deletions(-)
 create mode 100644 test/data/standard-torture-validate-field.warc

diff --git a/test/data/standard-torture-validate-field.warc b/test/data/standard-torture-validate-field.warc
new file mode 100644
index 00000000..2c28d72d
--- /dev/null
+++ b/test/data/standard-torture-validate-field.warc
@@ -0,0 +1,52 @@
+WARC/1.0
+WARC-Target-URI: <http://example.com/>
+WARC-Target-URI: example.com
+WARC-Target-URI: ex ample.com
+WARC-Target-URI: h<>ttp://example.com/
+WARC-Type: does-not-exist
+WARC-Type: CAPITALIZED
+WARC-Concurrent-To: http://example.com/
+WARC-Record-ID: <foo:bar>
+WARC-Date: 2017-03-06T04:03:53Z
+WARC-Date: 2017-03-06T04:03:53.Z
+Content-Type: asdf
+Content-Type: has space/asdf
+Content-Type: asdf/has space
+Content-Type: asdf/has space;asdf
+WARC-Block-Digest: asdf
+WARC-Block-Digest: has space:asdf
+WARC-Block-Digest: sha1:&$*^&*^#*&^
+WARC-IP-Address: 1.2.3.4.5
+WARC-Truncated: invalid
+WARC-Warcinfo-ID: asdf:asdf
+WARC-Filename: not-yet-tested
+WARC-Profile: asdf
+WARC-Profile: http://netpreserve.org/warc/1.0/revisit/identical-payload-digest
+WARC-Identified-Payload-Type: asdf
+WARC-Segment-Origin-ID: http://example.com
+WARC-Segment-Number: not-an-integer
+WARC-Segment-Number: 0
+WARC-Segment-Number: 1
+WARC-Segment-Number: 2
+WARC-Segment-Total-Length: 0
+WARC-Segment-Total-Length: not-an-integer
+WARC-Refers-To-Target-URI: http://example.com
+WARC-Refers-To-Date: not-a-date
+WARC-Unknown-Field: asdf
+Content-Length: 0
+
+
+WARC/1.1
+WARC-Date: 2017-03-06T04:03:53Z
+WARC-Date: 2017-03-06T04:03:53.Z
+WARC-Type: invalid
+Content-Length: 0
+
+
+WARC/1.1
+WARC-Type: request
+WARC-Segment-Number: 1
+Content-Length: 0
+
+
+WARC/invalid
diff --git a/test/data/standard-torture-validate-record.warc b/test/data/standard-torture-validate-record.warc
index 5181ea38..d212f370 100644
--- a/test/data/standard-torture-validate-record.warc
+++ b/test/data/standard-torture-validate-record.warc
@@ -71,6 +71,11 @@ WARC-Profile: http://netpreserve.org/warc/1.0/revisit/server-not-modified
 Content-Length: 0
 
 
+WARC/1.0
+WARC-Type: conversion
+Content-Length: 0
+
+
 WARC/1.0
 WARC-Type: continuation
 WARC-Segment-Number: 1
diff --git a/test/test_tests.py b/test/test_tests.py
index 239d2461..19b7e377 100644
--- a/test/test_tests.py
+++ b/test/test_tests.py
@@ -2,6 +2,7 @@
 
 from . import get_test_file
 from .test_cli import patch_stdout
+from warcio.utils import to_native_str
 
 
 def helper(args, expected_exit_value):
@@ -14,14 +15,14 @@ def helper(args, expected_exit_value):
         finally:
             assert exit_value == expected_exit_value
 
-        return buff.getvalue()
+        return to_native_str(buff.getvalue())
 
 
 def remove_before_test_data(s):
-    ret = b''
+    ret = ''
     for line in s.splitlines(True):
-        if b'/test/data/' in line:
-            line = b'test/data/' + line.split(b'/test/data/', 1)[1]
+        if '/test/data/' in line:
+            line = 'test/data/' + line.split('/test/data/', 1)[1]
         ret += line
     return ret
 
@@ -33,7 +34,7 @@ def test_torture_missing():
     args = ['test']
     args.extend(files)
 
-    expected = b"""\
+    expected = """\
 test/data/standard-torture-missing.warc
   WARC-Record-ID None
     WARC-Type warcinfo
@@ -55,7 +56,7 @@ def test_torture_validate_record():
     args = ['test']
     args.extend(files)
 
-    expected = b"""\
+    expected = """\
 test/data/standard-torture-validate-record.warc
   WARC-Record-ID None
     WARC-Type warcinfo
@@ -85,6 +86,7 @@ def test_torture_validate_record():
     digest not present
     error: missing required header WARC-Date
     error: missing required header WARC-Record-ID
+    error: recource records for dns: shall have Content-Type of text/dns, saw text/plain
   WARC-Record-ID None
     WARC-Type resource
     digest not present
@@ -133,6 +135,12 @@ def test_torture_validate_record():
     recommendation: missing recommended header WARC-Refers-To
     recommendation: missing recommended header WARC-Refers-To-Date
     comment: extension seen warc-profile http://netpreserve.org/warc/1.0/revisit/server-not-modified
+  WARC-Record-ID None
+    WARC-Type conversion
+    digest not present
+    error: missing required header WARC-Date
+    error: missing required header WARC-Record-ID
+    error: missing required header WARC-Target-URI
   WARC-Record-ID None
     WARC-Type continuation
     digest not present
@@ -145,5 +153,106 @@ def test_torture_validate_record():
 """
 
     value = helper(args, 0)
-    print(remove_before_test_data(value).decode())
+    print(remove_before_test_data(value))
+    assert remove_before_test_data(value) == expected
+
+
+def test_torture_validate_field():
+    files = ['standard-torture-validate-field.warc']
+    files = [get_test_file(filename) for filename in files]
+
+    args = ['test']
+    args.extend(files)
+
+    expected = """\
+test/data/standard-torture-validate-field.warc
+  WARC-Record-ID <foo:bar>
+    WARC-Type does-not-exist
+    unknown hash algorithm name in block digest
+    error: uri must not be within <> warc-target-uri <http://example.com/>
+    error: invalid uri scheme, bad character warc-target-uri <http://example.com/>
+    error: duplicate field seen warc-target-uri example.com
+    error: invalid uri, no scheme warc-target-uri example.com
+    error: duplicate field seen warc-target-uri ex ample.com
+    error: invalid uri, no scheme warc-target-uri ex ample.com
+    error: invalid uri, contains whitespace warc-target-uri ex ample.com
+    error: invalid uri scheme, bad character warc-target-uri ex ample.com
+    error: duplicate field seen warc-target-uri h<>ttp://example.com/
+    error: invalid uri scheme, bad character warc-target-uri h<>ttp://example.com/
+    error: duplicate field seen warc-type CAPITALIZED
+    error: uri must be within <> warc-concurrent-to http://example.com/
+    error: duplicate field seen warc-date 2017-03-06T04:03:53.Z
+    error: WARC 1.0 may not have fractional seconds warc-date 2017-03-06T04:03:53.Z
+    error: must contain a / content-type asdf
+    error: invalid subtype content-type asdf
+    error: duplicate field seen content-type has space/asdf
+    error: invalid type content-type has space/asdf
+    error: duplicate field seen content-type asdf/has space
+    error: invalid subtype content-type asdf/has space
+    error: duplicate field seen content-type asdf/has space;asdf
+    error: invalid subtype content-type asdf/has space;asdf
+    error: missing algorithm warc-block-digest asdf
+    error: duplicate field seen warc-block-digest has space:asdf
+    error: invalid algorithm warc-block-digest has space:asdf
+    error: duplicate field seen warc-block-digest sha1:&$*^&*^#*&^
+    error: uri must be within <> warc-warcinfo-id asdf:asdf
+    error: duplicate field seen warc-profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest
+    error: must contain a / warc-identified-payload-type asdf
+    error: invalid subtype warc-identified-payload-type asdf
+    error: uri must be within <> warc-segment-origin-id http://example.com
+    error: must be an integer warc-segment-number not-an-integer
+    error: duplicate field seen warc-segment-number 0
+    error: must be 1 or greater warc-segment-number 0
+    error: non-continuation records must always have WARC-Segment-Number = 1 warc-segment-number 0
+    error: duplicate field seen warc-segment-number 1
+    error: duplicate field seen warc-segment-number 2
+    error: non-continuation records must always have WARC-Segment-Number = 1 warc-segment-number 2
+    error: duplicate field seen warc-segment-total-length not-an-integer
+    error: must be an integer warc-segment-total-length not-an-integer
+    comment: unknown WARC-Type warc-type does-not-exist
+    comment: WARC-Type is not lower-case warc-type CAPITALIZED
+    comment: unknown WARC-Type warc-type CAPITALIZED
+    comment: unknown digest algorithm warc-block-digest asdf
+    comment: Invalid-looking digest value warc-block-digest sha1:&$*^&*^#*&^
+    comment: did not check ip address format, install ipaddress module from pypi if you care
+    comment: extension seen warc-truncated invalid
+    comment: extension seen warc-profile asdf
+    comment: extension seen warc-profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest
+    comment: field was introduced after this warc version WARC-Refers-To-Target-URI http://example.com 1.0
+    comment: field was introduced after this warc version WARC-Refers-To-Date not-a-date 1.0
+    comment: unknown field, no validation performed WARC-Unknown-Field asdf
+  WARC-Record-ID None
+    WARC-Type invalid
+    digest not present
+    error: duplicate field seen warc-date 2017-03-06T04:03:53.Z
+    error: fractional seconds must have 1-9 digits warc-date 2017-03-06T04:03:53.Z
+    comment: unknown WARC-Type warc-type invalid
+  WARC-Record-ID None
+    WARC-Type request
+    digest not present
+    error: missing required header Content-Type
+    error: missing required header WARC-Date
+    error: missing required header WARC-Record-ID
+    error: missing required header WARC-Target-URI
+    recommendation: do not segment WARC-Type request
+    comment: no configuration seen for WARC-Segment-Number request
+"""
+
+    value = helper(args, 0)
+    print(remove_before_test_data(value))
+    assert remove_before_test_data(value) == expected
+
+
+def test_arc():
+    files = ['does-not-exist.arc']
+    files = [get_test_file(filename) for filename in files]
+
+    args = ['test']
+    args.extend(files)
+
+    expected = """\
+test/data/does-not-exist.arc
+"""
+
+    value = helper(args, 0)
     assert remove_before_test_data(value) == expected
diff --git a/warcio/tester.py b/warcio/tester.py
index c978a404..4c2f8299 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -5,15 +5,15 @@
 import traceback
 
 from warcio.archiveiterator import WARCIterator
-from warcio.utils import to_native_str
+from warcio.utils import to_native_str, Digester
 
 
-def try_ipaddress_init():
+def try_ipaddress_import():
     # ipaddress is in 3.3+ but not 2.7. It is in pypi but we wish to limit dependencies.
     try:
         import ipaddress
     except ImportError:  # pragma: no cover
-        pass
+        print('ipaddress module not imported')
 
 
 class Commentary:
@@ -75,10 +75,10 @@ def validate_warc_fields(record, commentary):
     # field-value = *( field-content | LWS )  # LWS signals continuations
     # field-name = token  # token_re
 
-    content = record.content  # TESTME
+    content = record.content
     try:
         text = to_native_str(content, 'utf-8', errors='strict')
-    except UnicodeDecodeError as e:  # TESTME
+    except UnicodeDecodeError as e:
         commentary.error('warc-fields contains invalid utf-8: '+str(e))
         text = to_native_str(content, 'utf-8', errors='replace')
 
@@ -137,7 +137,7 @@ def validate_warcinfo(record, commentary, pending):
 
 
 def validate_response(record, commentary, pending):
-    target_uri = record.rec_headers.get_header('WARC-Target-URI', 'none').lower()  # TESTME
+    target_uri = record.rec_headers.get_header('WARC-Target-URI', 'none').lower()
 
     if target_uri.startswith('http:') or target_uri.startswith('https:'):
         content_type = record.rec_headers.get_header('Content-Type', 'none')
@@ -154,7 +154,7 @@ def validate_response(record, commentary, pending):
 
 
 def validate_resource(record, commentary, pending):
-    target_uri = record.rec_headers.get_header('WARC-Target-URI', '').lower()  # TESTME
+    target_uri = record.rec_headers.get_header('WARC-Target-URI', '').lower()
 
     if target_uri.startswith('dns:'):
         content_type = record.rec_headers.get_header('Content-Type', 'none')
@@ -169,7 +169,7 @@ def validate_resource(record, commentary, pending):
 
 
 def validate_request(record, commentary, pending):
-    target_uri = record.rec_headers.get_header('WARC-Target-URI', 'none').lower()  # TESTME
+    target_uri = record.rec_headers.get_header('WARC-Target-URI', 'none').lower()
 
     if target_uri.startswith('http:') or target_uri.startswith('https:'):
         content_type = record.rec_headers.get_header('Content-Type')
@@ -186,7 +186,7 @@ def validate_request(record, commentary, pending):
 
 
 def validate_metadata(record, commentary, pending):
-    content_type = record.rec_headers.get_header('Content-Type', 'none')  # TESTME
+    content_type = record.rec_headers.get_header('Content-Type', 'none')
     if content_type.lower() == 'application/warc-fields':
         # dublin core plus via, hopsFromSeed, fetchTimeMs -- w1.1 section 6
         # via: uri -- example in Warc 1.1 section 10.5 does not have <> around it
@@ -196,7 +196,7 @@ def validate_metadata(record, commentary, pending):
 
 
 def validate_revisit(record, commentary, pending):
-    warc_profile = record.rec_headers.get_header('WARC-Profile', 'none')  # TESTME
+    warc_profile = record.rec_headers.get_header('WARC-Profile', 'none')
 
     if warc_profile.endswith('/revisit/identical-payload-digest') or warc_profile.endswith('/revisit/uri-agnostic-identical-payload-digest'):
         config = {
@@ -222,11 +222,11 @@ def validate_revisit(record, commentary, pending):
 def validate_conversion(record, commentary, pending):
     # where practical, have a warc-refers-to field -- not quite a recommendation, perhaps make it a comment?
     # suggests there should be a corresponding metadata record -- which may have a WARC-Refers-To
-    pass  # TESTME
+    pass
 
 
 def validate_continuation(record, commentary, pending):
-    commentary.comment('warcio test continuation code has not been tested, expect bugs')  # TESTME
+    commentary.comment('warcio test continuation code has not been tested, expect bugs')
 
     segment_number = record.rec_headers.get_header('WARC-Segment-Number', 'none')
     if segment_number.isdigit() and int(segment_number) < 2:
@@ -240,14 +240,14 @@ def validate_actual_uri(field, value, record, version, commentary, pending):
     # should use a registered scheme
     # %XX encoding, normalize to upper case
     # schemes are case-insensitive and normalize to lower
-    if value.startswith('<') or value.endswith('>'):  # TESTME
+    if value.startswith('<') or value.endswith('>'):
         # wget 1.19 bug caused by WARC 1.0 spec error
         commentary.error('uri must not be within <>', field, value)
     if ':' not in value:
         commentary.error('invalid uri, no scheme', field, value)
     if re.search(r'\s', value):
         commentary.error('invalid uri, contains whitespace', field, value)
-    scheme, rest = value.split(':', 1)
+    scheme = value.split(':', 1)[0]
     if not re.search(r'\A[A-Za-z][A-Za-z0-9+\-\.]*\Z', scheme):
         commentary.error('invalid uri scheme, bad character', field, value)
     # https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
@@ -282,9 +282,10 @@ def validate_timestamp(field, value, record, version, commentary, pending):
             # XXX specification infelicity: would be nice to have 'advice to implementers' here
             commentary.error('WARC 1.0 may not have fractional seconds', field, value)
     else:
-        start, end = value.split('.', 1)
-        if not re.search(r'\A[0-9]{1,9}Z\Z', end):
-            commentary.error('fractional seconds must have 1-9 digits', field, value)
+        if '.' in value:
+            start, end = value.split('.', 1)
+            if not re.search(r'\A[0-9]{1,9}Z\Z', end):
+                commentary.error('fractional seconds must have 1-9 digits', field, value)
 
     # XXX the above is pretty incomplete for dash, colon, trailing Z, etc
 
@@ -304,7 +305,12 @@ def validate_content_length(field, value, record, version, commentary, pending):
 def validate_content_type(field, value, record, version, commentary, pending):
     if '/' not in value:
         commentary.error('must contain a /', field, value)
-    ctype, rest = value.split('/', 1)
+    splits = value.split('/', 1)
+    ctype = splits[0]
+    if len(splits) > 1:
+        rest = splits[1]
+    else:
+        rest = ''
     if not re.search(token_re, ctype):
         commentary.error('invalid type', field, value)
     if ';' in rest:
@@ -323,9 +329,19 @@ def validate_content_type(field, value, record, version, commentary, pending):
 def validate_digest(field, value, record, version, commentary, pending):
     if ':' not in value:
         commentary.error('missing algorithm', field, value)
-    algorithm, digest = value.split(':', 1)
+    splits = value.split(':', 1)
+    algorithm = splits[0]
+    if len(splits) > 1:
+        digest = splits[1]
+    else:
+        digest = 'none'
     if not re.search(token_re, algorithm):
         commentary.error('invalid algorithm', field, value)
+    else:
+        try:
+            Digester(algorithm)
+        except ValueError:
+            commentary.comment('unknown digest algorithm', field, value)
     if not re.search(token_re, digest):
         # https://github.com/iipc/warc-specifications/issues/48
         # commentary.comment('spec incorrectly says this is an invalid digest', field, value)
@@ -389,8 +405,8 @@ def validate_segment_number(field, value, record, version, commentary, pending):
     if rec_type != 'continuation':
         if iv != 1:
             commentary.error('non-continuation records must always have WARC-Segment-Number = 1', field, value)
-    elif rec_type in {'warcinfo', 'request', 'metadata', 'revisit'}:
-        commentary.recommendation('do not segment warc-type', warc_type)
+    if rec_type in {'warcinfo', 'request', 'metadata', 'revisit'}:
+        commentary.recommendation('do not segment WARC-Type', rec_type)
 
 
 def validate_segment_total_length(field, value, record, version, commentary, pending):
@@ -418,7 +434,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen
         'validate': validate_uri,
     },
     'WARC-Block-Digest': {
-        'validate': validate_digest,  # openssl check? or just let check_digest get it?
+        'validate': validate_digest,
     },
     'WARC-Payload-Digest': {
         'validate': validate_digest,
@@ -487,6 +503,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen
         'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated',
                      'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-Identified-Payload-Type'],
         'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
+        'validate': validate_resource,
     },
     'request': {
         'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type',
@@ -577,6 +594,7 @@ def validate_record(record):
         field = field.lower()
         if field != 'warc-concurrent-to' and field in seen_fields:
             commentary.error('duplicate field seen', field, value)
+        seen_fields.add(field)
         if field not in warc_fields:
             commentary.comment('unknown field, no validation performed', field_case, value)
             continue
@@ -588,9 +606,8 @@ def validate_record(record):
         if 'validate' in config:
             config['validate'](field, value, record, version, commentary, pending)
 
-    # TODO: validate warc types: unknown should get a comment
     if rec_type not in record_types:
-        commentary.comment('unknown record type, no validation performed', rec_type)
+        pass  # we print a comment for this elsewhere
     else:
         validate_fields_against_rec_type(rec_type, record_types[rec_type], record.rec_headers, commentary)
         validate_record_against_rec_type(record_types[rec_type], record, commentary, pending)
@@ -614,7 +631,7 @@ def _process_one(warc):
                 record.content  # make sure digests are checked
                 # XXX might need to read and digest the raw stream to check digests for chunked encoding?
                 # XXX chunked lacks Content-Length and presumably the digest needs to be computed on the non-chunked bytes
-            except Exception:
+            except Exception:  # pragma: no cover
                 # because of the top-level try: to catch exceptions in WARCIterator, this is needed to debug our code
                 print('Caught exception in warcio test analysis code')
                 traceback.print_exc()
@@ -643,7 +660,6 @@ class Tester(object):
     def __init__(self, cmd):
         self.inputs = cmd.inputs
         self.exit_value = 0
-        try_ipaddress_init()
 
     def process_all(self):
         for warc in self.inputs:
@@ -651,9 +667,12 @@ def process_all(self):
             try:
                 self.process_one(warc)
             except Exception as e:
-                print('  saw exception '+str(e).rstrip(), file=sys.stderr)
+                print('  saw exception '+repr(e).rstrip(), file=sys.stderr)
                 print('  skipping rest of file', file=sys.stderr)
         return self.exit_value
 
     def process_one(self, filename):
         _process_one(filename)
+
+
+try_ipaddress_import()

From e7f88e7183fac1dda31d8ad783d69f82da046e9c Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Sun, 27 Jan 2019 15:25:36 -0800
Subject: [PATCH 43/68] py2 testing

---
 test/test_tests.py | 23 ++++++++++++++++++++---
 warcio/tester.py   | 14 ++------------
 2 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/test/test_tests.py b/test/test_tests.py
index 19b7e377..a197c3ba 100644
--- a/test/test_tests.py
+++ b/test/test_tests.py
@@ -1,8 +1,10 @@
+import six
+
 from warcio.cli import main
+from warcio.utils import to_native_str
 
 from . import get_test_file
 from .test_cli import patch_stdout
-from warcio.utils import to_native_str
 
 
 def helper(args, expected_exit_value):
@@ -154,7 +156,13 @@ def test_torture_validate_record():
 
     value = helper(args, 0)
     print(remove_before_test_data(value))
-    assert remove_before_test_data(value) == expected
+
+    ret = remove_before_test_data(value)
+
+    if six.PY2:
+        expected = expected.replace('\n    error: warc-fields contains invalid utf-8: \'utf-8\' codec can\'t decode byte 0xc3 in position 57: invalid continuation byte\n', '\n')
+
+    assert ret == expected
 
 
 def test_torture_validate_field():
@@ -195,6 +203,7 @@ def test_torture_validate_field():
     error: duplicate field seen warc-block-digest has space:asdf
     error: invalid algorithm warc-block-digest has space:asdf
     error: duplicate field seen warc-block-digest sha1:&$*^&*^#*&^
+    error: invalid ip warc-ip-address 1.2.3.4.5
     error: uri must be within <> warc-warcinfo-id asdf:asdf
     error: duplicate field seen warc-profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest
     error: must contain a / warc-identified-payload-type asdf
@@ -214,7 +223,6 @@ def test_torture_validate_field():
     comment: unknown WARC-Type warc-type CAPITALIZED
     comment: unknown digest algorithm warc-block-digest asdf
     comment: Invalid-looking digest value warc-block-digest sha1:&$*^&*^#*&^
-    comment: did not check ip address format, install ipaddress module from pypi if you care
     comment: extension seen warc-truncated invalid
     comment: extension seen warc-profile asdf
     comment: extension seen warc-profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest
@@ -240,6 +248,15 @@ def test_torture_validate_field():
 
     value = helper(args, 0)
     print(remove_before_test_data(value))
+
+    ret = remove_before_test_data(value)
+    if six.PY2:
+        if 'error: invalid ip warc-ip-address 1.2.3.4.5' not in ret:
+            # user did not install ipaddress module
+            expected = expected.replace('\n    error: invalid ip warc-ip-address 1.2.3.4.5\n', '\n')
+            ret = ret.replace('\n    comment: did not check ip address format, install ipaddress module from pypi if you care\n', '\n')
+
+
     assert remove_before_test_data(value) == expected
 
 
diff --git a/warcio/tester.py b/warcio/tester.py
index 4c2f8299..308f35fd 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -8,14 +8,6 @@
 from warcio.utils import to_native_str, Digester
 
 
-def try_ipaddress_import():
-    # ipaddress is in 3.3+ but not 2.7. It is in pypi but we wish to limit dependencies.
-    try:
-        import ipaddress
-    except ImportError:  # pragma: no cover
-        print('ipaddress module not imported')
-
-
 class Commentary:
     def __init__(self, record_id, rec_type):
         self._record_id = record_id
@@ -353,10 +345,11 @@ def validate_digest(field, value, record, version, commentary, pending):
 def validate_ip(field, value, record, version, commentary, pending):
     # ipv4 as dotted quad, or ipv6 per section 2.2 of rfc 4291
     try:
+        import ipaddress
         ipaddress.ip_address(value)
     except ValueError:
         commentary.error('invalid ip', field, value)
-    except NameError:
+    except (ImportError, NameError):
         commentary.comment('did not check ip address format, install ipaddress module from pypi if you care')
 
 
@@ -673,6 +666,3 @@ def process_all(self):
 
     def process_one(self, filename):
         _process_one(filename)
-
-
-try_ipaddress_import()

From 86620731b78187270b8d77afb378a932ed6d3843 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Sun, 27 Jan 2019 15:35:30 -0800
Subject: [PATCH 44/68] py2 windows testing

---
 test/test_tests.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/test/test_tests.py b/test/test_tests.py
index a197c3ba..01e72ef4 100644
--- a/test/test_tests.py
+++ b/test/test_tests.py
@@ -25,6 +25,8 @@ def remove_before_test_data(s):
     for line in s.splitlines(True):
         if '/test/data/' in line:
             line = 'test/data/' + line.split('/test/data/', 1)[1]
+        if '\\test\\data\\' in line:
+            line = 'test/data/' + line.split('\\test\\data\\', 1)[1]
         ret += line
     return ret
 
@@ -247,17 +249,16 @@ def test_torture_validate_field():
 """
 
     value = helper(args, 0)
-    print(remove_before_test_data(value))
-
     ret = remove_before_test_data(value)
+
     if six.PY2:
         if 'error: invalid ip warc-ip-address 1.2.3.4.5' not in ret:
             # user did not install ipaddress module
             expected = expected.replace('\n    error: invalid ip warc-ip-address 1.2.3.4.5\n', '\n')
             ret = ret.replace('\n    comment: did not check ip address format, install ipaddress module from pypi if you care\n', '\n')
 
-
-    assert remove_before_test_data(value) == expected
+    print(ret)
+    assert ret == expected
 
 
 def test_arc():

From 291460e1970f61173109567ff00b1fe8d3452081 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Sun, 27 Jan 2019 23:11:46 -0800
Subject: [PATCH 45/68] coverage

---
 .../standard-torture-validate-record.warc     |  1 +
 test/test_tests.py                            | 55 ++++++++++++++++++-
 warcio/tester.py                              | 52 +++++++++---------
 3 files changed, 80 insertions(+), 28 deletions(-)

diff --git a/test/data/standard-torture-validate-record.warc b/test/data/standard-torture-validate-record.warc
index d212f370..08a39e50 100644
--- a/test/data/standard-torture-validate-record.warc
+++ b/test/data/standard-torture-validate-record.warc
@@ -1,6 +1,7 @@
 WARC/1.0
 WARC-Type: warcinfo
 Content-Type: application/warc-fields
+WARC-Refers-To: probhibited
 Content-Length: 146
 
  first line can't start with a space
diff --git a/test/test_tests.py b/test/test_tests.py
index 01e72ef4..0fdecc74 100644
--- a/test/test_tests.py
+++ b/test/test_tests.py
@@ -2,6 +2,7 @@
 
 from warcio.cli import main
 from warcio.utils import to_native_str
+import warcio.tester
 
 from . import get_test_file
 from .test_cli import patch_stdout
@@ -65,8 +66,10 @@ def test_torture_validate_record():
   WARC-Record-ID None
     WARC-Type warcinfo
     digest not present
+    error: uri must be within <> warc-refers-to probhibited
     error: missing required header WARC-Date
     error: missing required header WARC-Record-ID
+    error: field not allowed in record_type WARC-Refers-To warcinfo
     error: warc-fields contains invalid utf-8: 'utf-8' codec can't decode byte 0xc3 in position 57: invalid continuation byte
     comment: The first line of warc-fields cannot start with whitespace
     comment: warc-fields lines must end with \\r\\n: test: lines should end with \\r\\n
@@ -129,6 +132,7 @@ def test_torture_validate_record():
     recommendation: missing recommended header WARC-Refers-To
     recommendation: missing recommended header WARC-Refers-To-Date
     recommendation: missing recommended header WARC-Refers-To-Target-URI
+    comment: extension seen warc-profile http://netpreserve.org/warc/1.1/revisit/identical-payload-digest
   WARC-Record-ID None
     WARC-Type revisit
     digest not present
@@ -138,7 +142,6 @@ def test_torture_validate_record():
     error: missing required header WARC-Target-URI
     recommendation: missing recommended header WARC-Refers-To
     recommendation: missing recommended header WARC-Refers-To-Date
-    comment: extension seen warc-profile http://netpreserve.org/warc/1.0/revisit/server-not-modified
   WARC-Record-ID None
     WARC-Type conversion
     digest not present
@@ -227,7 +230,6 @@ def test_torture_validate_field():
     comment: Invalid-looking digest value warc-block-digest sha1:&$*^&*^#*&^
     comment: extension seen warc-truncated invalid
     comment: extension seen warc-profile asdf
-    comment: extension seen warc-profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest
     comment: field was introduced after this warc version WARC-Refers-To-Target-URI http://example.com 1.0
     comment: field was introduced after this warc version WARC-Refers-To-Date not-a-date 1.0
     comment: unknown field, no validation performed WARC-Unknown-Field asdf
@@ -274,3 +276,52 @@ def test_arc():
 
     value = helper(args, 0)
     assert remove_before_test_data(value) == expected
+
+
+def test_digests():
+    # needed for test coverage
+    files = ['example-digest-bad.warc']
+    files = [get_test_file(filename) for filename in files]
+
+    args = ['test']
+    args.extend(files)
+
+    expected = """\
+test/data/example-digest-bad.warc
+  WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
+    WARC-Type request
+    payload digest failed: sha1:1112H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ
+    error: WARC-IP-Address should be used for http and https requests
+  WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
+    WARC-Type request
+    digest pass
+    error: WARC-IP-Address should be used for http and https requests
+  WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
+    WARC-Type request
+    digest pass
+    error: WARC-IP-Address should be used for http and https requests
+  WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
+    WARC-Type request
+    digest pass
+    error: WARC-IP-Address should be used for http and https requests
+"""
+
+    value = helper(args, 0)
+    assert remove_before_test_data(value) == expected
+
+
+def test_leftovers():
+    commentary = warcio.tester.Commentary('id', 'type')
+
+    # hard to test because invalid WARC Content-Length raises in archiveiterator
+    warcio.tester.validate_content_length('content-length', 'not-an-integer', None, '1.0', commentary, None)
+
+    # hard to test because warcio checks the WARC version
+    warcio.tester.validate_profile('blah', 'blah', None, '999', commentary, None)
+
+    expected = '''\
+error: must be an integer content-length not-an-integer
+comment: no profile check because unknown warc version blah blah
+'''
+
+    assert '\n'.join(commentary.comments())+'\n' == expected
diff --git a/warcio/tester.py b/warcio/tester.py
index 308f35fd..de9f3ca1 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -2,10 +2,10 @@
 
 import re
 import sys
-import traceback
 
 from warcio.archiveiterator import WARCIterator
 from warcio.utils import to_native_str, Digester
+from warcio.exceptions import ArchiveLoadFailed
 
 
 class Commentary:
@@ -196,8 +196,11 @@ def validate_revisit(record, commentary, pending):
             'recommended': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date'],
         }
         validate_fields_against_rec_type('revisit', config, record.rec_headers, commentary, allow_all=True)
-        # may have record block; if not, shall have Content-Length: 0, if yes, should be like a response record, truncated FOR LENGTH ONLY if desired
-        # recommended that server response headers be preserved "in this manner"
+        # may have record block;
+        #  if not, shall have Content-Length: 0,
+        #  if yes, should be like a response record, truncated FOR LENGTH ONLY if desired
+        #  recommended that server response headers be preserved "in this manner"
+        #   I suppose that means headers are required if there is any content?!
 
     elif warc_profile.endswith('/revisit/server-not-modified'):
         config = {
@@ -205,7 +208,9 @@ def validate_revisit(record, commentary, pending):
             'prohibited': ['WARC-Payload-Digest'],
         }
         validate_fields_against_rec_type('revisit', config, record.rec_headers, commentary, allow_all=True)
-        #   may have content body; if not, shall have Content-Length: 0, if yes, should be like a response record, truncated if desired
+        #   may have content body;
+        #     if not, shall have Content-Length: 0,
+        #     if yes, should be like a response record, truncated if desired
         #   WARC-Refers-To-Date should be the same as WARC-Date in the original record if present
     else:
         commentary.comment('no revisit details validation done due to unknown profile')
@@ -343,13 +348,12 @@ def validate_digest(field, value, record, version, commentary, pending):
 
 
 def validate_ip(field, value, record, version, commentary, pending):
-    # ipv4 as dotted quad, or ipv6 per section 2.2 of rfc 4291
     try:
         import ipaddress
         ipaddress.ip_address(value)
     except ValueError:
         commentary.error('invalid ip', field, value)
-    except (ImportError, NameError):
+    except (ImportError, NameError):  # pragma: no cover (for python 2.7)
         commentary.comment('did not check ip address format, install ipaddress module from pypi if you care')
 
 
@@ -369,12 +373,14 @@ def validate_filename(field, value, record, version, commentary, pending):
 
 
 profiles = {
-    '1.0': ['http://netpreserve.org/warc/1.1/revisit/identical-payload-digest',
-            'http://netpreserve.org/warc/1.1/revisit/server-not-modified',
+    # XXX WARC/0.17 and WARC/0.18
+    '1.0': ['http://netpreserve.org/warc/1.0/revisit/identical-payload-digest',
+            'http://netpreserve.org/warc/1.0/revisit/server-not-modified',
             # the following removed from iipc/webarchive-commons in may 2017; common in the wild TODO comment or not?
+            # https://github.com/iipc/webarchive-commons/commits/988bec707c27a01333becfc3bd502af4441ea1e1/src/main/java/org/archive/format/warc/WARCConstants.java
             'http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest'],
-    '1.1': ['http://netpreserve.org/warc/1.0/revisit/identical-payload-digest',
-            'http://netpreserve.org/warc/1.0/revisit/server-not-modified'],
+    '1.1': ['http://netpreserve.org/warc/1.1/revisit/identical-payload-digest',
+            'http://netpreserve.org/warc/1.1/revisit/server-not-modified'],
 }
 
 
@@ -614,21 +620,15 @@ def _process_one(warc):
     with open(warc, 'rb') as stream:
         for record in WARCIterator(stream, check_digests=True, fixup_bugs=False):
 
-            try:
-                record = WrapRecord(record)
-                digest_present = (record.rec_headers.get_header('WARC-Payload-Digest') or
-                                  record.rec_headers.get_header('WARC-Block-Digest'))
+            record = WrapRecord(record)
+            digest_present = (record.rec_headers.get_header('WARC-Payload-Digest') or
+                              record.rec_headers.get_header('WARC-Block-Digest'))
 
-                commentary = validate_record(record)
+            commentary = validate_record(record)
 
-                record.content  # make sure digests are checked
-                # XXX might need to read and digest the raw stream to check digests for chunked encoding?
-                # XXX chunked lacks Content-Length and presumably the digest needs to be computed on the non-chunked bytes
-            except Exception:  # pragma: no cover
-                # because of the top-level try: to catch exceptions in WARCIterator, this is needed to debug our code
-                print('Caught exception in warcio test analysis code')
-                traceback.print_exc()
-                exit(1)
+            record.content  # make sure digests are checked
+            # XXX might need to read and digest the raw stream to check digests for chunked encoding?
+            # XXX chunked lacks Content-Length and presumably the digest needs to be computed on the non-chunked bytes
 
             if commentary.has_comments() or record.digest_checker.passed is False:
                 print(' ', 'WARC-Record-ID', commentary.record_id())
@@ -637,7 +637,7 @@ def _process_one(warc):
                 if record.digest_checker.passed is True:
                     print('    digest pass')
                 elif record.digest_checker.passed is None:
-                    if digest_present:
+                    if digest_present:  # pragma: no cover
                         print('    digest present but not checked')
                     else:
                         print('    digest not present')
@@ -659,8 +659,8 @@ def process_all(self):
             print(warc)
             try:
                 self.process_one(warc)
-            except Exception as e:
-                print('  saw exception '+repr(e).rstrip(), file=sys.stderr)
+            except ArchiveLoadFailed as e:
+                print('  saw exception ArchiveLoadFailed: '+str(e).rstrip(), file=sys.stderr)
                 print('  skipping rest of file', file=sys.stderr)
         return self.exit_value
 

From 69080d51a10aa6ad2f592447955cd858b0f9fe10 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Sun, 27 Jan 2019 23:57:53 -0800
Subject: [PATCH 46/68] branch coverage

---
 .../data/standard-torture-validate-field.warc |  1 +
 .../standard-torture-validate-record.warc     | 26 +++++++++++
 test/test_tests.py                            | 44 ++++++++++++++++++-
 warcio/tester.py                              |  1 +
 4 files changed, 71 insertions(+), 1 deletion(-)

diff --git a/test/data/standard-torture-validate-field.warc b/test/data/standard-torture-validate-field.warc
index 2c28d72d..c88d3ee6 100644
--- a/test/data/standard-torture-validate-field.warc
+++ b/test/data/standard-torture-validate-field.warc
@@ -39,6 +39,7 @@ Content-Length: 0
 WARC/1.1
 WARC-Date: 2017-03-06T04:03:53Z
 WARC-Date: 2017-03-06T04:03:53.Z
+WARC-Date: 2017-03-06T04:03:53.0Z
 WARC-Type: invalid
 Content-Length: 0
 
diff --git a/test/data/standard-torture-validate-record.warc b/test/data/standard-torture-validate-record.warc
index 08a39e50..6f06205e 100644
--- a/test/data/standard-torture-validate-record.warc
+++ b/test/data/standard-torture-validate-record.warc
@@ -41,9 +41,23 @@ Content-Type: text/dns
 Content-Length: 0
 
 
+WARC/1.0
+WARC-Type: resource
+WARC-Target-URI: foo:bar
+Content-Length: 0
+
+
+WARC/1.0
+WARC-Type: request
+WARC-Target-URI: hTtP://example.com/
+Content-Type: text/plain
+Content-Length: 0
+
+
 WARC/1.0
 WARC-Type: request
 WARC-Target-URI: hTtP://example.com/
+WARC-IP-Address: 1.2.3.4
 Content-Type: text/plain
 Content-Length: 0
 
@@ -54,6 +68,12 @@ Content-Type: application/warc-fields
 Content-Length: 0
 
 
+WARC/1.0
+WARC-Type: metadata
+Content-Type: not-application/warc-fields
+Content-Length: 0
+
+
 WARC/1.0
 WARC-Type: revisit
 WARC-Profile: none
@@ -83,3 +103,9 @@ WARC-Segment-Number: 1
 Content-Length: 0
 
 
+WARC/1.0
+WARC-Type: continuation
+WARC-Segment-Number: 2
+Content-Length: 0
+
+
diff --git a/test/test_tests.py b/test/test_tests.py
index 0fdecc74..174466c8 100644
--- a/test/test_tests.py
+++ b/test/test_tests.py
@@ -99,6 +99,12 @@ def test_torture_validate_record():
     digest not present
     error: missing required header WARC-Date
     error: missing required header WARC-Record-ID
+  WARC-Record-ID None
+    WARC-Type resource
+    digest not present
+    error: missing required header Content-Type
+    error: missing required header WARC-Date
+    error: missing required header WARC-Record-ID
   WARC-Record-ID None
     WARC-Type request
     digest not present
@@ -106,12 +112,23 @@ def test_torture_validate_record():
     error: missing required header WARC-Record-ID
     error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw text/plain
     error: WARC-IP-Address should be used for http and https requests
+  WARC-Record-ID None
+    WARC-Type request
+    digest not present
+    error: missing required header WARC-Date
+    error: missing required header WARC-Record-ID
+    error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw text/plain
   WARC-Record-ID None
     WARC-Type metadata
     digest not present
     error: missing required header WARC-Date
     error: missing required header WARC-Record-ID
     comment: warc-fields body present but empty
+  WARC-Record-ID None
+    WARC-Type metadata
+    digest not present
+    error: missing required header WARC-Date
+    error: missing required header WARC-Record-ID
   WARC-Record-ID None
     WARC-Type revisit
     digest not present
@@ -157,6 +174,14 @@ def test_torture_validate_record():
     error: missing required header WARC-Target-URI
     error: continuation record must have WARC-Segment-Number > 1, saw 1
     comment: warcio test continuation code has not been tested, expect bugs
+  WARC-Record-ID None
+    WARC-Type continuation
+    digest not present
+    error: missing required header WARC-Date
+    error: missing required header WARC-Record-ID
+    error: missing required header WARC-Segment-Origin-ID
+    error: missing required header WARC-Target-URI
+    comment: warcio test continuation code has not been tested, expect bugs
 """
 
     value = helper(args, 0)
@@ -238,6 +263,7 @@ def test_torture_validate_field():
     digest not present
     error: duplicate field seen warc-date 2017-03-06T04:03:53.Z
     error: fractional seconds must have 1-9 digits warc-date 2017-03-06T04:03:53.Z
+    error: duplicate field seen warc-date 2017-03-06T04:03:53.0Z
     comment: unknown WARC-Type warc-type invalid
   WARC-Record-ID None
     WARC-Type request
@@ -280,7 +306,7 @@ def test_arc():
 
 def test_digests():
     # needed for test coverage
-    files = ['example-digest-bad.warc']
+    files = ['example-digest-bad.warc', 'example.warc']
     files = [get_test_file(filename) for filename in files]
 
     args = ['test']
@@ -304,6 +330,21 @@ def test_digests():
     WARC-Type request
     digest pass
     error: WARC-IP-Address should be used for http and https requests
+test/data/example.warc
+  WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
+    WARC-Type request
+    digest not present
+    error: WARC-IP-Address should be used for http and https requests
+  WARC-Record-ID <urn:uuid:e6e395ca-0221-11e7-a18d-0242ac120005>
+    WARC-Type revisit
+    digest present but not checked
+    recommendation: missing recommended header WARC-Refers-To
+    comment: field was introduced after this warc version WARC-Refers-To-Target-URI http://example.com/ 1.0
+    comment: field was introduced after this warc version WARC-Refers-To-Date 2017-03-06T04:02:06Z 1.0
+  WARC-Record-ID <urn:uuid:e6e41fea-0221-11e7-8fe3-0242ac120007>
+    WARC-Type request
+    digest not present
+    error: WARC-IP-Address should be used for http and https requests
 """
 
     value = helper(args, 0)
@@ -312,6 +353,7 @@ def test_digests():
 
 def test_leftovers():
     commentary = warcio.tester.Commentary('id', 'type')
+    assert not commentary.has_comments()
 
     # hard to test because invalid WARC Content-Length raises in archiveiterator
     warcio.tester.validate_content_length('content-length', 'not-an-integer', None, '1.0', commentary, None)
diff --git a/warcio/tester.py b/warcio/tester.py
index de9f3ca1..eaf7f09f 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -638,6 +638,7 @@ def _process_one(warc):
                     print('    digest pass')
                 elif record.digest_checker.passed is None:
                     if digest_present:  # pragma: no cover
+                        # WARC record missing Content-Length: header, which is verboten
                         print('    digest present but not checked')
                     else:
                         print('    digest not present')

From 2e1d82012ad958f96ad30440d044207c9c187634 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Mon, 28 Jan 2019 00:05:31 -0800
Subject: [PATCH 47/68] py2 branch coverage

---
 test/test_tests.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/test_tests.py b/test/test_tests.py
index 174466c8..98517308 100644
--- a/test/test_tests.py
+++ b/test/test_tests.py
@@ -191,6 +191,7 @@ def test_torture_validate_record():
 
     if six.PY2:
         expected = expected.replace('\n    error: warc-fields contains invalid utf-8: \'utf-8\' codec can\'t decode byte 0xc3 in position 57: invalid continuation byte\n', '\n')
+        ret = ret.replace('\n    comment: did not check ip address format, install ipaddress module from pypi if you care\n', '\n')
 
     assert ret == expected
 

From bbdb57b4d37900ec0220f251860a06ead96093c1 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Mon, 28 Jan 2019 10:09:17 -0800
Subject: [PATCH 48/68] py2 testing

---
 setup.py           | 17 +++++++++++------
 test/test_tests.py | 20 +++++---------------
 warcio/tester.py   | 26 ++++++++++++++++++++------
 3 files changed, 36 insertions(+), 27 deletions(-)

diff --git a/setup.py b/setup.py
index 0203bb64..f0390160 100755
--- a/setup.py
+++ b/setup.py
@@ -4,6 +4,7 @@
 from setuptools import setup, find_packages
 from setuptools.command.test import test as TestCommand
 import glob
+import sys
 
 __version__ = '1.7.1'
 
@@ -21,6 +22,15 @@ def run_tests(self):
         errcode = pytest.main(['--doctest-modules', './warcio', '--cov', 'warcio', '-v', 'test/'])
         sys.exit(errcode)
 
+tests_require = [
+    'pytest',
+    'pytest-cov',
+    'httpbin==0.5.0',
+    'requests',
+]
+if sys.version_info < (3, 3):
+    tests_require.append('ipaddress')
+
 setup(
     name='warcio',
     version=__version__,
@@ -44,12 +54,7 @@ def run_tests(self):
     """,
     cmdclass={'test': PyTest},
     test_suite='',
-    tests_require=[
-        'pytest',
-        'pytest-cov',
-        'httpbin==0.5.0',
-        'requests',
-    ],
+    tests_require=tests_require,
     classifiers=[
         'Development Status :: 5 - Production/Stable',
         'Environment :: Web Environment',
diff --git a/test/test_tests.py b/test/test_tests.py
index 98517308..dab1e669 100644
--- a/test/test_tests.py
+++ b/test/test_tests.py
@@ -187,13 +187,9 @@ def test_torture_validate_record():
     value = helper(args, 0)
     print(remove_before_test_data(value))
 
-    ret = remove_before_test_data(value)
+    actual = remove_before_test_data(value)
 
-    if six.PY2:
-        expected = expected.replace('\n    error: warc-fields contains invalid utf-8: \'utf-8\' codec can\'t decode byte 0xc3 in position 57: invalid continuation byte\n', '\n')
-        ret = ret.replace('\n    comment: did not check ip address format, install ipaddress module from pypi if you care\n', '\n')
-
-    assert ret == expected
+    assert actual == expected
 
 
 def test_torture_validate_field():
@@ -278,16 +274,10 @@ def test_torture_validate_field():
 """
 
     value = helper(args, 0)
-    ret = remove_before_test_data(value)
-
-    if six.PY2:
-        if 'error: invalid ip warc-ip-address 1.2.3.4.5' not in ret:
-            # user did not install ipaddress module
-            expected = expected.replace('\n    error: invalid ip warc-ip-address 1.2.3.4.5\n', '\n')
-            ret = ret.replace('\n    comment: did not check ip address format, install ipaddress module from pypi if you care\n', '\n')
+    actual = remove_before_test_data(value)
 
-    print(ret)
-    assert ret == expected
+    print(actual)
+    assert actual == expected
 
 
 def test_arc():
diff --git a/warcio/tester.py b/warcio/tester.py
index eaf7f09f..f00479ff 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -2,6 +2,7 @@
 
 import re
 import sys
+import six
 
 from warcio.archiveiterator import WARCIterator
 from warcio.utils import to_native_str, Digester
@@ -68,11 +69,22 @@ def validate_warc_fields(record, commentary):
     # field-name = token  # token_re
 
     content = record.content
-    try:
-        text = to_native_str(content, 'utf-8', errors='strict')
-    except UnicodeDecodeError as e:
-        commentary.error('warc-fields contains invalid utf-8: '+str(e))
-        text = to_native_str(content, 'utf-8', errors='replace')
+
+    if six.PY2:  # pragma: no cover
+        try:
+            content.decode('utf-8', errors='strict')
+            text = content  # already a str
+        except UnicodeDecodeError as e:
+            err = str(e)
+            err = err.replace('utf8', 'utf-8')  # sigh
+            commentary.error('warc-fields contains invalid utf-8: '+err)
+            text = content.decode('utf-8', errors='replace')
+    else:  # pragma: no cover
+        try:
+            text = to_native_str(content, 'utf-8', errors='strict')
+        except UnicodeDecodeError as e:
+            commentary.error('warc-fields contains invalid utf-8: '+str(e))
+            text = to_native_str(content, 'utf-8', errors='replace')
 
     first_line = True
     lines = []
@@ -350,10 +362,12 @@ def validate_digest(field, value, record, version, commentary, pending):
 def validate_ip(field, value, record, version, commentary, pending):
     try:
         import ipaddress
+        if six.PY2:  # pragma: no cover
+            value = unicode(value)
         ipaddress.ip_address(value)
     except ValueError:
         commentary.error('invalid ip', field, value)
-    except (ImportError, NameError):  # pragma: no cover (for python 2.7)
+    except (ImportError, NameError):  # pragma: no cover
         commentary.comment('did not check ip address format, install ipaddress module from pypi if you care')
 
 

From fc2d7b42549deada63ad1e65e81214a90fa75301 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Mon, 28 Jan 2019 11:15:57 -0800
Subject: [PATCH 49/68] add record ids to test

---
 test/data/standard-torture-missing.warc       |  5 -
 .../standard-torture-validate-record.warc     | 25 +++++
 test/test_tests.py                            | 91 ++++++++-----------
 warcio/tester.py                              |  2 +-
 4 files changed, 63 insertions(+), 60 deletions(-)
 delete mode 100644 test/data/standard-torture-missing.warc

diff --git a/test/data/standard-torture-missing.warc b/test/data/standard-torture-missing.warc
deleted file mode 100644
index a1ab0714..00000000
--- a/test/data/standard-torture-missing.warc
+++ /dev/null
@@ -1,5 +0,0 @@
-WARC/1.0
-WARC-Type: warcinfo
-Content-Length: 0
-
-
diff --git a/test/data/standard-torture-validate-record.warc b/test/data/standard-torture-validate-record.warc
index 6f06205e..fa03b38e 100644
--- a/test/data/standard-torture-validate-record.warc
+++ b/test/data/standard-torture-validate-record.warc
@@ -15,13 +15,24 @@ token cannot have a space:
 
 
 WARC/1.0
+WARC-Record-ID: test-empty-warc-fields
 WARC-Type: warcinfo
 Content-Type: application/warc-fields
 Content-Length: 0
 
 
+WARC/1.0
+WARC-Type: warcinfo
+WARC-Record-ID: test-warcinfo-non-recommended-content-type
+Content-Type: not-application/warc-fields
+Content-Length: 5
+
+foo
+
+
 WARC/1.0
 WARC-Type: response
+WARC-Record-ID: test-response-content-type
 WARC-Target-URI: HtTp://example.com/
 Content-Type: text/plain
 Content-Length: 0
@@ -29,6 +40,7 @@ Content-Length: 0
 
 WARC/1.0
 WARC-Type: resource
+WARC-Record-ID: test-resource-dns-content-type
 WARC-Target-URI: DnS:asdfasdf
 Content-Type: text/plain
 Content-Length: 0
@@ -36,6 +48,8 @@ Content-Length: 0
 
 WARC/1.0
 WARC-Type: resource
+WARC-Record-ID: test-resource-dns-empty
+WARC-Test-TODO: add another with valid block
 WARC-Target-URI: DnS:asdfasdf
 Content-Type: text/dns
 Content-Length: 0
@@ -43,12 +57,14 @@ Content-Length: 0
 
 WARC/1.0
 WARC-Type: resource
+WARC-Record-ID: test-resource-not-dns
 WARC-Target-URI: foo:bar
 Content-Length: 0
 
 
 WARC/1.0
 WARC-Type: request
+WARC-Record-ID: test-request-unrecommended-content-type
 WARC-Target-URI: hTtP://example.com/
 Content-Type: text/plain
 Content-Length: 0
@@ -56,6 +72,7 @@ Content-Length: 0
 
 WARC/1.0
 WARC-Type: request
+WARC-Record-ID: test-request-unrecommended-content-type-with-ip
 WARC-Target-URI: hTtP://example.com/
 WARC-IP-Address: 1.2.3.4
 Content-Type: text/plain
@@ -64,47 +81,55 @@ Content-Length: 0
 
 WARC/1.0
 WARC-Type: metadata
+WARC-Record-ID: test-metadata-warc-fields-empty
 Content-Type: application/warc-fields
 Content-Length: 0
 
 
 WARC/1.0
 WARC-Type: metadata
+WARC-Record-ID: test-metadata-not-warc-fields
 Content-Type: not-application/warc-fields
 Content-Length: 0
 
 
 WARC/1.0
 WARC-Type: revisit
+WARC-Record-ID: test-revisit-profile-unknown
 WARC-Profile: none
 Content-Length: 0
 
 
 WARC/1.0
 WARC-Type: revisit
+WARC-Record-ID: test-revisit-profile-future
 WARC-Profile: http://netpreserve.org/warc/1.1/revisit/identical-payload-digest
 Content-Length: 0
 
 
 WARC/1.0
 WARC-Type: revisit
+WARC-Record-ID: test-revisit-profile-good
 WARC-Profile: http://netpreserve.org/warc/1.0/revisit/server-not-modified
 Content-Length: 0
 
 
 WARC/1.0
 WARC-Type: conversion
+WARC-Record-ID: test-conversion
 Content-Length: 0
 
 
 WARC/1.0
 WARC-Type: continuation
+WARC-Record-ID: test-continuation-segment-1
 WARC-Segment-Number: 1
 Content-Length: 0
 
 
 WARC/1.0
 WARC-Type: continuation
+WARC-Record-ID: test-continuation-segment-valid
 WARC-Segment-Number: 2
 Content-Length: 0
 
diff --git a/test/test_tests.py b/test/test_tests.py
index dab1e669..723b2bd9 100644
--- a/test/test_tests.py
+++ b/test/test_tests.py
@@ -1,5 +1,3 @@
-import six
-
 from warcio.cli import main
 from warcio.utils import to_native_str
 import warcio.tester
@@ -32,28 +30,6 @@ def remove_before_test_data(s):
     return ret
 
 
-def test_torture_missing():
-    files = ['standard-torture-missing.warc']
-    files = [get_test_file(filename) for filename in files]
-
-    args = ['test']
-    args.extend(files)
-
-    expected = """\
-test/data/standard-torture-missing.warc
-  WARC-Record-ID None
-    WARC-Type warcinfo
-    digest not present
-    error: missing required header Content-Type
-    error: missing required header WARC-Date
-    error: missing required header WARC-Record-ID
-    recommendation: warcinfo Content-Type of application/warc-fields, saw none
-"""
-
-    value = helper(args, 0)
-    assert remove_before_test_data(value) == expected
-
-
 def test_torture_validate_record():
     files = ['standard-torture-validate-record.warc']
     files = [get_test_file(filename) for filename in files]
@@ -75,110 +51,117 @@ def test_torture_validate_record():
     comment: warc-fields lines must end with \\r\\n: test: lines should end with \\r\\n
     comment: Missing field-name : in warc-fields line: no colon
     comment: invalid warc-fields name: token cannot have a space
-  WARC-Record-ID None
+  WARC-Record-ID test-empty-warc-fields
     WARC-Type warcinfo
     digest not present
+    error: uri must be within <> warc-record-id test-empty-warc-fields
     error: missing required header WARC-Date
-    error: missing required header WARC-Record-ID
     comment: warc-fields body present but empty
-  WARC-Record-ID None
+  WARC-Record-ID test-warcinfo-non-recommended-content-type
+    WARC-Type warcinfo
+    digest not present
+    error: uri must be within <> warc-record-id test-warcinfo-non-recommended-content-type
+    error: missing required header WARC-Date
+    recommendation: warcinfo Content-Type recommended to be application/warc-fields, saw not-application/warc-fields
+  WARC-Record-ID test-response-content-type
     WARC-Type response
     digest not present
+    error: uri must be within <> warc-record-id test-response-content-type
     error: missing required header WARC-Date
-    error: missing required header WARC-Record-ID
     error: responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw text/plain
     error: WARC-IP-Address should be used for http and https responses
-  WARC-Record-ID None
+  WARC-Record-ID test-resource-dns-content-type
     WARC-Type resource
     digest not present
+    error: uri must be within <> warc-record-id test-resource-dns-content-type
     error: missing required header WARC-Date
-    error: missing required header WARC-Record-ID
     error: recource records for dns: shall have Content-Type of text/dns, saw text/plain
-  WARC-Record-ID None
+  WARC-Record-ID test-resource-dns-empty
     WARC-Type resource
     digest not present
+    error: uri must be within <> warc-record-id test-resource-dns-empty
     error: missing required header WARC-Date
-    error: missing required header WARC-Record-ID
-  WARC-Record-ID None
+    comment: unknown field, no validation performed WARC-Test-TODO add another with valid block
+  WARC-Record-ID test-resource-not-dns
     WARC-Type resource
     digest not present
+    error: uri must be within <> warc-record-id test-resource-not-dns
     error: missing required header Content-Type
     error: missing required header WARC-Date
-    error: missing required header WARC-Record-ID
-  WARC-Record-ID None
+  WARC-Record-ID test-request-unrecommended-content-type
     WARC-Type request
     digest not present
+    error: uri must be within <> warc-record-id test-request-unrecommended-content-type
     error: missing required header WARC-Date
-    error: missing required header WARC-Record-ID
     error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw text/plain
     error: WARC-IP-Address should be used for http and https requests
-  WARC-Record-ID None
+  WARC-Record-ID test-request-unrecommended-content-type-with-ip
     WARC-Type request
     digest not present
+    error: uri must be within <> warc-record-id test-request-unrecommended-content-type-with-ip
     error: missing required header WARC-Date
-    error: missing required header WARC-Record-ID
     error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw text/plain
-  WARC-Record-ID None
+  WARC-Record-ID test-metadata-warc-fields-empty
     WARC-Type metadata
     digest not present
+    error: uri must be within <> warc-record-id test-metadata-warc-fields-empty
     error: missing required header WARC-Date
-    error: missing required header WARC-Record-ID
     comment: warc-fields body present but empty
-  WARC-Record-ID None
+  WARC-Record-ID test-metadata-not-warc-fields
     WARC-Type metadata
     digest not present
+    error: uri must be within <> warc-record-id test-metadata-not-warc-fields
     error: missing required header WARC-Date
-    error: missing required header WARC-Record-ID
-  WARC-Record-ID None
+  WARC-Record-ID test-revisit-profile-unknown
     WARC-Type revisit
     digest not present
+    error: uri must be within <> warc-record-id test-revisit-profile-unknown
     error: missing required header Content-Type
     error: missing required header WARC-Date
-    error: missing required header WARC-Record-ID
     error: missing required header WARC-Target-URI
     comment: extension seen warc-profile none
     comment: no revisit details validation done due to unknown profile
-  WARC-Record-ID None
+  WARC-Record-ID test-revisit-profile-future
     WARC-Type revisit
     digest not present
+    error: uri must be within <> warc-record-id test-revisit-profile-future
     error: missing required header Content-Type
     error: missing required header WARC-Date
-    error: missing required header WARC-Record-ID
     error: missing required header WARC-Target-URI
     error: missing required header WARC-Payload-Digest
     recommendation: missing recommended header WARC-Refers-To
     recommendation: missing recommended header WARC-Refers-To-Date
     recommendation: missing recommended header WARC-Refers-To-Target-URI
     comment: extension seen warc-profile http://netpreserve.org/warc/1.1/revisit/identical-payload-digest
-  WARC-Record-ID None
+  WARC-Record-ID test-revisit-profile-good
     WARC-Type revisit
     digest not present
+    error: uri must be within <> warc-record-id test-revisit-profile-good
     error: missing required header Content-Type
     error: missing required header WARC-Date
-    error: missing required header WARC-Record-ID
     error: missing required header WARC-Target-URI
     recommendation: missing recommended header WARC-Refers-To
     recommendation: missing recommended header WARC-Refers-To-Date
-  WARC-Record-ID None
+  WARC-Record-ID test-conversion
     WARC-Type conversion
     digest not present
+    error: uri must be within <> warc-record-id test-conversion
     error: missing required header WARC-Date
-    error: missing required header WARC-Record-ID
     error: missing required header WARC-Target-URI
-  WARC-Record-ID None
+  WARC-Record-ID test-continuation-segment-1
     WARC-Type continuation
     digest not present
+    error: uri must be within <> warc-record-id test-continuation-segment-1
     error: missing required header WARC-Date
-    error: missing required header WARC-Record-ID
     error: missing required header WARC-Segment-Origin-ID
     error: missing required header WARC-Target-URI
     error: continuation record must have WARC-Segment-Number > 1, saw 1
     comment: warcio test continuation code has not been tested, expect bugs
-  WARC-Record-ID None
+  WARC-Record-ID test-continuation-segment-valid
     WARC-Type continuation
     digest not present
+    error: uri must be within <> warc-record-id test-continuation-segment-valid
     error: missing required header WARC-Date
-    error: missing required header WARC-Record-ID
     error: missing required header WARC-Segment-Origin-ID
     error: missing required header WARC-Target-URI
     comment: warcio test continuation code has not been tested, expect bugs
diff --git a/warcio/tester.py b/warcio/tester.py
index f00479ff..e9755c8c 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -125,7 +125,7 @@ def validate_warc_fields(record, commentary):
 def validate_warcinfo(record, commentary, pending):
     content_type = record.rec_headers.get_header('Content-Type', 'none')
     if content_type.lower() != 'application/warc-fields':
-        commentary.recommendation('warcinfo Content-Type of application/warc-fields, saw', content_type)
+        commentary.recommendation('warcinfo Content-Type recommended to be application/warc-fields, saw', content_type)
     else:
         #   format: warc-fields
         #   allowable fields include but not limited to DMCI plus the following

From d1fe18edb4220acb53b5304fde7e321679c7c42d Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Mon, 28 Jan 2019 12:19:35 -0800
Subject: [PATCH 50/68] preserve capitalization in messages

---
 test/test_tests.py | 142 ++++++++++++++++++++++-----------------------
 warcio/tester.py   |  15 +++--
 2 files changed, 78 insertions(+), 79 deletions(-)

diff --git a/test/test_tests.py b/test/test_tests.py
index 723b2bd9..c922eff1 100644
--- a/test/test_tests.py
+++ b/test/test_tests.py
@@ -42,7 +42,7 @@ def test_torture_validate_record():
   WARC-Record-ID None
     WARC-Type warcinfo
     digest not present
-    error: uri must be within <> warc-refers-to probhibited
+    error: uri must be within <> WARC-Refers-To probhibited
     error: missing required header WARC-Date
     error: missing required header WARC-Record-ID
     error: field not allowed in record_type WARC-Refers-To warcinfo
@@ -54,77 +54,77 @@ def test_torture_validate_record():
   WARC-Record-ID test-empty-warc-fields
     WARC-Type warcinfo
     digest not present
-    error: uri must be within <> warc-record-id test-empty-warc-fields
+    error: uri must be within <> WARC-Record-ID test-empty-warc-fields
     error: missing required header WARC-Date
     comment: warc-fields body present but empty
   WARC-Record-ID test-warcinfo-non-recommended-content-type
     WARC-Type warcinfo
     digest not present
-    error: uri must be within <> warc-record-id test-warcinfo-non-recommended-content-type
+    error: uri must be within <> WARC-Record-ID test-warcinfo-non-recommended-content-type
     error: missing required header WARC-Date
     recommendation: warcinfo Content-Type recommended to be application/warc-fields, saw not-application/warc-fields
   WARC-Record-ID test-response-content-type
     WARC-Type response
     digest not present
-    error: uri must be within <> warc-record-id test-response-content-type
+    error: uri must be within <> WARC-Record-ID test-response-content-type
     error: missing required header WARC-Date
     error: responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw text/plain
     error: WARC-IP-Address should be used for http and https responses
   WARC-Record-ID test-resource-dns-content-type
     WARC-Type resource
     digest not present
-    error: uri must be within <> warc-record-id test-resource-dns-content-type
+    error: uri must be within <> WARC-Record-ID test-resource-dns-content-type
     error: missing required header WARC-Date
     error: recource records for dns: shall have Content-Type of text/dns, saw text/plain
   WARC-Record-ID test-resource-dns-empty
     WARC-Type resource
     digest not present
-    error: uri must be within <> warc-record-id test-resource-dns-empty
+    error: uri must be within <> WARC-Record-ID test-resource-dns-empty
     error: missing required header WARC-Date
     comment: unknown field, no validation performed WARC-Test-TODO add another with valid block
   WARC-Record-ID test-resource-not-dns
     WARC-Type resource
     digest not present
-    error: uri must be within <> warc-record-id test-resource-not-dns
+    error: uri must be within <> WARC-Record-ID test-resource-not-dns
     error: missing required header Content-Type
     error: missing required header WARC-Date
   WARC-Record-ID test-request-unrecommended-content-type
     WARC-Type request
     digest not present
-    error: uri must be within <> warc-record-id test-request-unrecommended-content-type
+    error: uri must be within <> WARC-Record-ID test-request-unrecommended-content-type
     error: missing required header WARC-Date
     error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw text/plain
     error: WARC-IP-Address should be used for http and https requests
   WARC-Record-ID test-request-unrecommended-content-type-with-ip
     WARC-Type request
     digest not present
-    error: uri must be within <> warc-record-id test-request-unrecommended-content-type-with-ip
+    error: uri must be within <> WARC-Record-ID test-request-unrecommended-content-type-with-ip
     error: missing required header WARC-Date
     error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw text/plain
   WARC-Record-ID test-metadata-warc-fields-empty
     WARC-Type metadata
     digest not present
-    error: uri must be within <> warc-record-id test-metadata-warc-fields-empty
+    error: uri must be within <> WARC-Record-ID test-metadata-warc-fields-empty
     error: missing required header WARC-Date
     comment: warc-fields body present but empty
   WARC-Record-ID test-metadata-not-warc-fields
     WARC-Type metadata
     digest not present
-    error: uri must be within <> warc-record-id test-metadata-not-warc-fields
+    error: uri must be within <> WARC-Record-ID test-metadata-not-warc-fields
     error: missing required header WARC-Date
   WARC-Record-ID test-revisit-profile-unknown
     WARC-Type revisit
     digest not present
-    error: uri must be within <> warc-record-id test-revisit-profile-unknown
+    error: uri must be within <> WARC-Record-ID test-revisit-profile-unknown
     error: missing required header Content-Type
     error: missing required header WARC-Date
     error: missing required header WARC-Target-URI
-    comment: extension seen warc-profile none
+    comment: extension seen WARC-Profile none
     comment: no revisit details validation done due to unknown profile
   WARC-Record-ID test-revisit-profile-future
     WARC-Type revisit
     digest not present
-    error: uri must be within <> warc-record-id test-revisit-profile-future
+    error: uri must be within <> WARC-Record-ID test-revisit-profile-future
     error: missing required header Content-Type
     error: missing required header WARC-Date
     error: missing required header WARC-Target-URI
@@ -132,11 +132,11 @@ def test_torture_validate_record():
     recommendation: missing recommended header WARC-Refers-To
     recommendation: missing recommended header WARC-Refers-To-Date
     recommendation: missing recommended header WARC-Refers-To-Target-URI
-    comment: extension seen warc-profile http://netpreserve.org/warc/1.1/revisit/identical-payload-digest
+    comment: extension seen WARC-Profile http://netpreserve.org/warc/1.1/revisit/identical-payload-digest
   WARC-Record-ID test-revisit-profile-good
     WARC-Type revisit
     digest not present
-    error: uri must be within <> warc-record-id test-revisit-profile-good
+    error: uri must be within <> WARC-Record-ID test-revisit-profile-good
     error: missing required header Content-Type
     error: missing required header WARC-Date
     error: missing required header WARC-Target-URI
@@ -145,13 +145,13 @@ def test_torture_validate_record():
   WARC-Record-ID test-conversion
     WARC-Type conversion
     digest not present
-    error: uri must be within <> warc-record-id test-conversion
+    error: uri must be within <> WARC-Record-ID test-conversion
     error: missing required header WARC-Date
     error: missing required header WARC-Target-URI
   WARC-Record-ID test-continuation-segment-1
     WARC-Type continuation
     digest not present
-    error: uri must be within <> warc-record-id test-continuation-segment-1
+    error: uri must be within <> WARC-Record-ID test-continuation-segment-1
     error: missing required header WARC-Date
     error: missing required header WARC-Segment-Origin-ID
     error: missing required header WARC-Target-URI
@@ -160,7 +160,7 @@ def test_torture_validate_record():
   WARC-Record-ID test-continuation-segment-valid
     WARC-Type continuation
     digest not present
-    error: uri must be within <> warc-record-id test-continuation-segment-valid
+    error: uri must be within <> WARC-Record-ID test-continuation-segment-valid
     error: missing required header WARC-Date
     error: missing required header WARC-Segment-Origin-ID
     error: missing required header WARC-Target-URI
@@ -187,64 +187,64 @@ def test_torture_validate_field():
   WARC-Record-ID <foo:bar>
     WARC-Type does-not-exist
     unknown hash algorithm name in block digest
-    error: uri must not be within <> warc-target-uri <http://example.com/>
-    error: invalid uri scheme, bad character warc-target-uri <http://example.com/>
-    error: duplicate field seen warc-target-uri example.com
-    error: invalid uri, no scheme warc-target-uri example.com
-    error: duplicate field seen warc-target-uri ex ample.com
-    error: invalid uri, no scheme warc-target-uri ex ample.com
-    error: invalid uri, contains whitespace warc-target-uri ex ample.com
-    error: invalid uri scheme, bad character warc-target-uri ex ample.com
-    error: duplicate field seen warc-target-uri h<>ttp://example.com/
-    error: invalid uri scheme, bad character warc-target-uri h<>ttp://example.com/
-    error: duplicate field seen warc-type CAPITALIZED
-    error: uri must be within <> warc-concurrent-to http://example.com/
-    error: duplicate field seen warc-date 2017-03-06T04:03:53.Z
-    error: WARC 1.0 may not have fractional seconds warc-date 2017-03-06T04:03:53.Z
-    error: must contain a / content-type asdf
-    error: invalid subtype content-type asdf
-    error: duplicate field seen content-type has space/asdf
-    error: invalid type content-type has space/asdf
-    error: duplicate field seen content-type asdf/has space
-    error: invalid subtype content-type asdf/has space
-    error: duplicate field seen content-type asdf/has space;asdf
-    error: invalid subtype content-type asdf/has space;asdf
-    error: missing algorithm warc-block-digest asdf
-    error: duplicate field seen warc-block-digest has space:asdf
-    error: invalid algorithm warc-block-digest has space:asdf
-    error: duplicate field seen warc-block-digest sha1:&$*^&*^#*&^
-    error: invalid ip warc-ip-address 1.2.3.4.5
-    error: uri must be within <> warc-warcinfo-id asdf:asdf
-    error: duplicate field seen warc-profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest
-    error: must contain a / warc-identified-payload-type asdf
-    error: invalid subtype warc-identified-payload-type asdf
-    error: uri must be within <> warc-segment-origin-id http://example.com
-    error: must be an integer warc-segment-number not-an-integer
-    error: duplicate field seen warc-segment-number 0
-    error: must be 1 or greater warc-segment-number 0
-    error: non-continuation records must always have WARC-Segment-Number = 1 warc-segment-number 0
-    error: duplicate field seen warc-segment-number 1
-    error: duplicate field seen warc-segment-number 2
-    error: non-continuation records must always have WARC-Segment-Number = 1 warc-segment-number 2
-    error: duplicate field seen warc-segment-total-length not-an-integer
-    error: must be an integer warc-segment-total-length not-an-integer
-    comment: unknown WARC-Type warc-type does-not-exist
-    comment: WARC-Type is not lower-case warc-type CAPITALIZED
-    comment: unknown WARC-Type warc-type CAPITALIZED
-    comment: unknown digest algorithm warc-block-digest asdf
-    comment: Invalid-looking digest value warc-block-digest sha1:&$*^&*^#*&^
-    comment: extension seen warc-truncated invalid
-    comment: extension seen warc-profile asdf
+    error: uri must not be within <> WARC-Target-URI <http://example.com/>
+    error: invalid uri scheme, bad character WARC-Target-URI <http://example.com/>
+    error: duplicate field seen WARC-Target-URI example.com
+    error: invalid uri, no scheme WARC-Target-URI example.com
+    error: duplicate field seen WARC-Target-URI ex ample.com
+    error: invalid uri, no scheme WARC-Target-URI ex ample.com
+    error: invalid uri, contains whitespace WARC-Target-URI ex ample.com
+    error: invalid uri scheme, bad character WARC-Target-URI ex ample.com
+    error: duplicate field seen WARC-Target-URI h<>ttp://example.com/
+    error: invalid uri scheme, bad character WARC-Target-URI h<>ttp://example.com/
+    error: duplicate field seen WARC-Type CAPITALIZED
+    error: uri must be within <> WARC-Concurrent-To http://example.com/
+    error: duplicate field seen WARC-Date 2017-03-06T04:03:53.Z
+    error: WARC 1.0 may not have fractional seconds WARC-Date 2017-03-06T04:03:53.Z
+    error: must contain a / Content-Type asdf
+    error: invalid subtype Content-Type asdf
+    error: duplicate field seen Content-Type has space/asdf
+    error: invalid type Content-Type has space/asdf
+    error: duplicate field seen Content-Type asdf/has space
+    error: invalid subtype Content-Type asdf/has space
+    error: duplicate field seen Content-Type asdf/has space;asdf
+    error: invalid subtype Content-Type asdf/has space;asdf
+    error: missing algorithm WARC-Block-Digest asdf
+    error: duplicate field seen WARC-Block-Digest has space:asdf
+    error: invalid algorithm WARC-Block-Digest has space:asdf
+    error: duplicate field seen WARC-Block-Digest sha1:&$*^&*^#*&^
+    error: invalid ip WARC-IP-Address 1.2.3.4.5
+    error: uri must be within <> WARC-Warcinfo-ID asdf:asdf
+    error: duplicate field seen WARC-Profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest
+    error: must contain a / WARC-Identified-Payload-Type asdf
+    error: invalid subtype WARC-Identified-Payload-Type asdf
+    error: uri must be within <> WARC-Segment-Origin-ID http://example.com
+    error: must be an integer WARC-Segment-Number not-an-integer
+    error: duplicate field seen WARC-Segment-Number 0
+    error: must be 1 or greater WARC-Segment-Number 0
+    error: non-continuation records must always have WARC-Segment-Number = 1 WARC-Segment-Number 0
+    error: duplicate field seen WARC-Segment-Number 1
+    error: duplicate field seen WARC-Segment-Number 2
+    error: non-continuation records must always have WARC-Segment-Number = 1 WARC-Segment-Number 2
+    error: duplicate field seen WARC-Segment-Total-Length not-an-integer
+    error: must be an integer WARC-Segment-Total-Length not-an-integer
+    comment: unknown WARC-Type WARC-Type does-not-exist
+    comment: WARC-Type is not lower-case WARC-Type CAPITALIZED
+    comment: unknown WARC-Type WARC-Type CAPITALIZED
+    comment: unknown digest algorithm WARC-Block-Digest asdf
+    comment: Invalid-looking digest value WARC-Block-Digest sha1:&$*^&*^#*&^
+    comment: extension seen WARC-Truncated invalid
+    comment: extension seen WARC-Profile asdf
     comment: field was introduced after this warc version WARC-Refers-To-Target-URI http://example.com 1.0
     comment: field was introduced after this warc version WARC-Refers-To-Date not-a-date 1.0
     comment: unknown field, no validation performed WARC-Unknown-Field asdf
   WARC-Record-ID None
     WARC-Type invalid
     digest not present
-    error: duplicate field seen warc-date 2017-03-06T04:03:53.Z
-    error: fractional seconds must have 1-9 digits warc-date 2017-03-06T04:03:53.Z
-    error: duplicate field seen warc-date 2017-03-06T04:03:53.0Z
-    comment: unknown WARC-Type warc-type invalid
+    error: duplicate field seen WARC-Date 2017-03-06T04:03:53.Z
+    error: fractional seconds must have 1-9 digits WARC-Date 2017-03-06T04:03:53.Z
+    error: duplicate field seen WARC-Date 2017-03-06T04:03:53.0Z
+    comment: unknown WARC-Type WARC-Type invalid
   WARC-Record-ID None
     WARC-Type request
     digest not present
diff --git a/warcio/tester.py b/warcio/tester.py
index e9755c8c..2300d062 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -603,19 +603,18 @@ def validate_record(record):
 
     seen_fields = set()
     for field, value in record.rec_headers.headers:
-        field_case = field
-        field = field.lower()
-        if field != 'warc-concurrent-to' and field in seen_fields:
+        field_l = field.lower()
+        if field_l != 'warc-concurrent-to' and field_l in seen_fields:
             commentary.error('duplicate field seen', field, value)
-        seen_fields.add(field)
-        if field not in warc_fields:
-            commentary.comment('unknown field, no validation performed', field_case, value)
+        seen_fields.add(field_l)
+        if field_l not in warc_fields:
+            commentary.comment('unknown field, no validation performed', field, value)
             continue
-        config = warc_fields[field]
+        config = warc_fields[field_l]
         if 'minver' in config:
             if version < config['minver']:
                 # unknown fields are extensions, so this is a comment and not an error
-                commentary.comment('field was introduced after this warc version', field_case, value, version)
+                commentary.comment('field was introduced after this warc version', field, value, version)
         if 'validate' in config:
             config['validate'](field, value, record, version, commentary, pending)
 

From 484da9c4cad5f058cec41f68d46b7e31cfd74fef Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Mon, 28 Jan 2019 12:55:48 -0800
Subject: [PATCH 51/68] capitals and colons

---
 test/test_tests.py | 264 ++++++++++++++++++++++-----------------------
 warcio/tester.py   |  88 +++++++--------
 2 files changed, 177 insertions(+), 175 deletions(-)

diff --git a/test/test_tests.py b/test/test_tests.py
index c922eff1..91eba656 100644
--- a/test/test_tests.py
+++ b/test/test_tests.py
@@ -42,128 +42,128 @@ def test_torture_validate_record():
   WARC-Record-ID None
     WARC-Type warcinfo
     digest not present
-    error: uri must be within <> WARC-Refers-To probhibited
-    error: missing required header WARC-Date
-    error: missing required header WARC-Record-ID
-    error: field not allowed in record_type WARC-Refers-To warcinfo
+    error: uri must be within <>: WARC-Refers-To probhibited
+    error: missing required header: WARC-Date
+    error: missing required header: WARC-Record-ID
+    error: field not allowed in record type: warcinfo WARC-Refers-To
     error: warc-fields contains invalid utf-8: 'utf-8' codec can't decode byte 0xc3 in position 57: invalid continuation byte
     comment: The first line of warc-fields cannot start with whitespace
     comment: warc-fields lines must end with \\r\\n: test: lines should end with \\r\\n
-    comment: Missing field-name : in warc-fields line: no colon
-    comment: invalid warc-fields name: token cannot have a space
+    comment: Missing colon in warc-fields line: no colon
+    comment: Invalid warc-fields name: token cannot have a space
   WARC-Record-ID test-empty-warc-fields
     WARC-Type warcinfo
     digest not present
-    error: uri must be within <> WARC-Record-ID test-empty-warc-fields
-    error: missing required header WARC-Date
+    error: uri must be within <>: WARC-Record-ID test-empty-warc-fields
+    error: missing required header: WARC-Date
     comment: warc-fields body present but empty
   WARC-Record-ID test-warcinfo-non-recommended-content-type
     WARC-Type warcinfo
     digest not present
-    error: uri must be within <> WARC-Record-ID test-warcinfo-non-recommended-content-type
-    error: missing required header WARC-Date
-    recommendation: warcinfo Content-Type recommended to be application/warc-fields, saw not-application/warc-fields
+    error: uri must be within <>: WARC-Record-ID test-warcinfo-non-recommended-content-type
+    error: missing required header: WARC-Date
+    recommendation: warcinfo Content-Type recommended to be application/warc-fields, saw: not-application/warc-fields
   WARC-Record-ID test-response-content-type
     WARC-Type response
     digest not present
-    error: uri must be within <> WARC-Record-ID test-response-content-type
-    error: missing required header WARC-Date
-    error: responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw text/plain
+    error: uri must be within <>: WARC-Record-ID test-response-content-type
+    error: missing required header: WARC-Date
+    error: responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw: text/plain
     error: WARC-IP-Address should be used for http and https responses
   WARC-Record-ID test-resource-dns-content-type
     WARC-Type resource
     digest not present
-    error: uri must be within <> WARC-Record-ID test-resource-dns-content-type
-    error: missing required header WARC-Date
-    error: recource records for dns: shall have Content-Type of text/dns, saw text/plain
+    error: uri must be within <>: WARC-Record-ID test-resource-dns-content-type
+    error: missing required header: WARC-Date
+    error: recource records for dns: shall have Content-Type of text/dns, saw: text/plain
   WARC-Record-ID test-resource-dns-empty
     WARC-Type resource
     digest not present
-    error: uri must be within <> WARC-Record-ID test-resource-dns-empty
-    error: missing required header WARC-Date
-    comment: unknown field, no validation performed WARC-Test-TODO add another with valid block
+    error: uri must be within <>: WARC-Record-ID test-resource-dns-empty
+    error: missing required header: WARC-Date
+    comment: unknown field, no validation performed: WARC-Test-TODO add another with valid block
   WARC-Record-ID test-resource-not-dns
     WARC-Type resource
     digest not present
-    error: uri must be within <> WARC-Record-ID test-resource-not-dns
-    error: missing required header Content-Type
-    error: missing required header WARC-Date
+    error: uri must be within <>: WARC-Record-ID test-resource-not-dns
+    error: missing required header: Content-Type
+    error: missing required header: WARC-Date
   WARC-Record-ID test-request-unrecommended-content-type
     WARC-Type request
     digest not present
-    error: uri must be within <> WARC-Record-ID test-request-unrecommended-content-type
-    error: missing required header WARC-Date
-    error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw text/plain
+    error: uri must be within <>: WARC-Record-ID test-request-unrecommended-content-type
+    error: missing required header: WARC-Date
+    error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw: text/plain
     error: WARC-IP-Address should be used for http and https requests
   WARC-Record-ID test-request-unrecommended-content-type-with-ip
     WARC-Type request
     digest not present
-    error: uri must be within <> WARC-Record-ID test-request-unrecommended-content-type-with-ip
-    error: missing required header WARC-Date
-    error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw text/plain
+    error: uri must be within <>: WARC-Record-ID test-request-unrecommended-content-type-with-ip
+    error: missing required header: WARC-Date
+    error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw: text/plain
   WARC-Record-ID test-metadata-warc-fields-empty
     WARC-Type metadata
     digest not present
-    error: uri must be within <> WARC-Record-ID test-metadata-warc-fields-empty
-    error: missing required header WARC-Date
+    error: uri must be within <>: WARC-Record-ID test-metadata-warc-fields-empty
+    error: missing required header: WARC-Date
     comment: warc-fields body present but empty
   WARC-Record-ID test-metadata-not-warc-fields
     WARC-Type metadata
     digest not present
-    error: uri must be within <> WARC-Record-ID test-metadata-not-warc-fields
-    error: missing required header WARC-Date
+    error: uri must be within <>: WARC-Record-ID test-metadata-not-warc-fields
+    error: missing required header: WARC-Date
   WARC-Record-ID test-revisit-profile-unknown
     WARC-Type revisit
     digest not present
-    error: uri must be within <> WARC-Record-ID test-revisit-profile-unknown
-    error: missing required header Content-Type
-    error: missing required header WARC-Date
-    error: missing required header WARC-Target-URI
-    comment: extension seen WARC-Profile none
-    comment: no revisit details validation done due to unknown profile
+    error: uri must be within <>: WARC-Record-ID test-revisit-profile-unknown
+    error: missing required header: Content-Type
+    error: missing required header: WARC-Date
+    error: missing required header: WARC-Target-URI
+    comment: extension seen: WARC-Profile none
+    comment: no revisit details validation done due to unknown profile: none
   WARC-Record-ID test-revisit-profile-future
     WARC-Type revisit
     digest not present
-    error: uri must be within <> WARC-Record-ID test-revisit-profile-future
-    error: missing required header Content-Type
-    error: missing required header WARC-Date
-    error: missing required header WARC-Target-URI
-    error: missing required header WARC-Payload-Digest
-    recommendation: missing recommended header WARC-Refers-To
-    recommendation: missing recommended header WARC-Refers-To-Date
-    recommendation: missing recommended header WARC-Refers-To-Target-URI
-    comment: extension seen WARC-Profile http://netpreserve.org/warc/1.1/revisit/identical-payload-digest
+    error: uri must be within <>: WARC-Record-ID test-revisit-profile-future
+    error: missing required header: Content-Type
+    error: missing required header: WARC-Date
+    error: missing required header: WARC-Target-URI
+    error: missing required header: WARC-Payload-Digest
+    recommendation: missing recommended header: WARC-Refers-To
+    recommendation: missing recommended header: WARC-Refers-To-Date
+    recommendation: missing recommended header: WARC-Refers-To-Target-URI
+    comment: extension seen: WARC-Profile http://netpreserve.org/warc/1.1/revisit/identical-payload-digest
   WARC-Record-ID test-revisit-profile-good
     WARC-Type revisit
     digest not present
-    error: uri must be within <> WARC-Record-ID test-revisit-profile-good
-    error: missing required header Content-Type
-    error: missing required header WARC-Date
-    error: missing required header WARC-Target-URI
-    recommendation: missing recommended header WARC-Refers-To
-    recommendation: missing recommended header WARC-Refers-To-Date
+    error: uri must be within <>: WARC-Record-ID test-revisit-profile-good
+    error: missing required header: Content-Type
+    error: missing required header: WARC-Date
+    error: missing required header: WARC-Target-URI
+    recommendation: missing recommended header: WARC-Refers-To
+    recommendation: missing recommended header: WARC-Refers-To-Date
   WARC-Record-ID test-conversion
     WARC-Type conversion
     digest not present
-    error: uri must be within <> WARC-Record-ID test-conversion
-    error: missing required header WARC-Date
-    error: missing required header WARC-Target-URI
+    error: uri must be within <>: WARC-Record-ID test-conversion
+    error: missing required header: WARC-Date
+    error: missing required header: WARC-Target-URI
   WARC-Record-ID test-continuation-segment-1
     WARC-Type continuation
     digest not present
-    error: uri must be within <> WARC-Record-ID test-continuation-segment-1
-    error: missing required header WARC-Date
-    error: missing required header WARC-Segment-Origin-ID
-    error: missing required header WARC-Target-URI
-    error: continuation record must have WARC-Segment-Number > 1, saw 1
+    error: uri must be within <>: WARC-Record-ID test-continuation-segment-1
+    error: missing required header: WARC-Date
+    error: missing required header: WARC-Segment-Origin-ID
+    error: missing required header: WARC-Target-URI
+    error: continuation record must have WARC-Segment-Number > 1, saw: 1
     comment: warcio test continuation code has not been tested, expect bugs
   WARC-Record-ID test-continuation-segment-valid
     WARC-Type continuation
     digest not present
-    error: uri must be within <> WARC-Record-ID test-continuation-segment-valid
-    error: missing required header WARC-Date
-    error: missing required header WARC-Segment-Origin-ID
-    error: missing required header WARC-Target-URI
+    error: uri must be within <>: WARC-Record-ID test-continuation-segment-valid
+    error: missing required header: WARC-Date
+    error: missing required header: WARC-Segment-Origin-ID
+    error: missing required header: WARC-Target-URI
     comment: warcio test continuation code has not been tested, expect bugs
 """
 
@@ -187,73 +187,73 @@ def test_torture_validate_field():
   WARC-Record-ID <foo:bar>
     WARC-Type does-not-exist
     unknown hash algorithm name in block digest
-    error: uri must not be within <> WARC-Target-URI <http://example.com/>
-    error: invalid uri scheme, bad character WARC-Target-URI <http://example.com/>
-    error: duplicate field seen WARC-Target-URI example.com
-    error: invalid uri, no scheme WARC-Target-URI example.com
-    error: duplicate field seen WARC-Target-URI ex ample.com
-    error: invalid uri, no scheme WARC-Target-URI ex ample.com
-    error: invalid uri, contains whitespace WARC-Target-URI ex ample.com
-    error: invalid uri scheme, bad character WARC-Target-URI ex ample.com
-    error: duplicate field seen WARC-Target-URI h<>ttp://example.com/
-    error: invalid uri scheme, bad character WARC-Target-URI h<>ttp://example.com/
-    error: duplicate field seen WARC-Type CAPITALIZED
-    error: uri must be within <> WARC-Concurrent-To http://example.com/
-    error: duplicate field seen WARC-Date 2017-03-06T04:03:53.Z
-    error: WARC 1.0 may not have fractional seconds WARC-Date 2017-03-06T04:03:53.Z
-    error: must contain a / Content-Type asdf
-    error: invalid subtype Content-Type asdf
-    error: duplicate field seen Content-Type has space/asdf
-    error: invalid type Content-Type has space/asdf
-    error: duplicate field seen Content-Type asdf/has space
-    error: invalid subtype Content-Type asdf/has space
-    error: duplicate field seen Content-Type asdf/has space;asdf
-    error: invalid subtype Content-Type asdf/has space;asdf
-    error: missing algorithm WARC-Block-Digest asdf
-    error: duplicate field seen WARC-Block-Digest has space:asdf
-    error: invalid algorithm WARC-Block-Digest has space:asdf
-    error: duplicate field seen WARC-Block-Digest sha1:&$*^&*^#*&^
-    error: invalid ip WARC-IP-Address 1.2.3.4.5
-    error: uri must be within <> WARC-Warcinfo-ID asdf:asdf
-    error: duplicate field seen WARC-Profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest
-    error: must contain a / WARC-Identified-Payload-Type asdf
-    error: invalid subtype WARC-Identified-Payload-Type asdf
-    error: uri must be within <> WARC-Segment-Origin-ID http://example.com
-    error: must be an integer WARC-Segment-Number not-an-integer
-    error: duplicate field seen WARC-Segment-Number 0
-    error: must be 1 or greater WARC-Segment-Number 0
-    error: non-continuation records must always have WARC-Segment-Number = 1 WARC-Segment-Number 0
-    error: duplicate field seen WARC-Segment-Number 1
-    error: duplicate field seen WARC-Segment-Number 2
-    error: non-continuation records must always have WARC-Segment-Number = 1 WARC-Segment-Number 2
-    error: duplicate field seen WARC-Segment-Total-Length not-an-integer
-    error: must be an integer WARC-Segment-Total-Length not-an-integer
-    comment: unknown WARC-Type WARC-Type does-not-exist
-    comment: WARC-Type is not lower-case WARC-Type CAPITALIZED
-    comment: unknown WARC-Type WARC-Type CAPITALIZED
-    comment: unknown digest algorithm WARC-Block-Digest asdf
-    comment: Invalid-looking digest value WARC-Block-Digest sha1:&$*^&*^#*&^
-    comment: extension seen WARC-Truncated invalid
-    comment: extension seen WARC-Profile asdf
-    comment: field was introduced after this warc version WARC-Refers-To-Target-URI http://example.com 1.0
-    comment: field was introduced after this warc version WARC-Refers-To-Date not-a-date 1.0
-    comment: unknown field, no validation performed WARC-Unknown-Field asdf
+    error: uri must not be within <>: WARC-Target-URI <http://example.com/>
+    error: invalid uri scheme, bad character: WARC-Target-URI <http://example.com/>
+    error: duplicate field seen: WARC-Target-URI example.com
+    error: invalid uri, no scheme: WARC-Target-URI example.com
+    error: duplicate field seen: WARC-Target-URI ex ample.com
+    error: invalid uri, no scheme: WARC-Target-URI ex ample.com
+    error: invalid uri, contains whitespace: WARC-Target-URI ex ample.com
+    error: invalid uri scheme, bad character: WARC-Target-URI ex ample.com
+    error: duplicate field seen: WARC-Target-URI h<>ttp://example.com/
+    error: invalid uri scheme, bad character: WARC-Target-URI h<>ttp://example.com/
+    error: duplicate field seen: WARC-Type CAPITALIZED
+    error: uri must be within <>: WARC-Concurrent-To http://example.com/
+    error: duplicate field seen: WARC-Date 2017-03-06T04:03:53.Z
+    error: WARC 1.0 time may not have fractional seconds: WARC-Date 2017-03-06T04:03:53.Z
+    error: must contain a /: Content-Type asdf
+    error: invalid subtype: Content-Type asdf
+    error: duplicate field seen: Content-Type has space/asdf
+    error: invalid type: Content-Type has space/asdf
+    error: duplicate field seen: Content-Type asdf/has space
+    error: invalid subtype: Content-Type asdf/has space
+    error: duplicate field seen: Content-Type asdf/has space;asdf
+    error: invalid subtype: Content-Type asdf/has space;asdf
+    error: missing algorithm: WARC-Block-Digest asdf
+    error: duplicate field seen: WARC-Block-Digest has space:asdf
+    error: invalid algorithm: WARC-Block-Digest has space:asdf
+    error: duplicate field seen: WARC-Block-Digest sha1:&$*^&*^#*&^
+    error: invalid ip: WARC-IP-Address 1.2.3.4.5
+    error: uri must be within <>: WARC-Warcinfo-ID asdf:asdf
+    error: duplicate field seen: WARC-Profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest
+    error: must contain a /: WARC-Identified-Payload-Type asdf
+    error: invalid subtype: WARC-Identified-Payload-Type asdf
+    error: uri must be within <>: WARC-Segment-Origin-ID http://example.com
+    error: must be an integer: WARC-Segment-Number not-an-integer
+    error: duplicate field seen: WARC-Segment-Number 0
+    error: must be 1 or greater: WARC-Segment-Number 0
+    error: non-continuation records must always have WARC-Segment-Number: 1: WARC-Segment-Number 0
+    error: duplicate field seen: WARC-Segment-Number 1
+    error: duplicate field seen: WARC-Segment-Number 2
+    error: non-continuation records must always have WARC-Segment-Number: 1: WARC-Segment-Number 2
+    error: duplicate field seen: WARC-Segment-Total-Length not-an-integer
+    error: must be an integer: WARC-Segment-Total-Length not-an-integer
+    comment: unknown WARC-Type: WARC-Type does-not-exist
+    comment: WARC-Type is not lower-case: WARC-Type CAPITALIZED
+    comment: unknown WARC-Type: WARC-Type CAPITALIZED
+    comment: unknown digest algorithm: WARC-Block-Digest asdf
+    comment: Invalid-looking digest value: WARC-Block-Digest sha1:&$*^&*^#*&^
+    comment: extension seen: WARC-Truncated invalid
+    comment: extension seen: WARC-Profile asdf
+    comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com
+    comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Date not-a-date
+    comment: unknown field, no validation performed: WARC-Unknown-Field asdf
   WARC-Record-ID None
     WARC-Type invalid
     digest not present
-    error: duplicate field seen WARC-Date 2017-03-06T04:03:53.Z
-    error: fractional seconds must have 1-9 digits WARC-Date 2017-03-06T04:03:53.Z
-    error: duplicate field seen WARC-Date 2017-03-06T04:03:53.0Z
-    comment: unknown WARC-Type WARC-Type invalid
+    error: duplicate field seen: WARC-Date 2017-03-06T04:03:53.Z
+    error: fractional seconds must have 1-9 digits: WARC-Date 2017-03-06T04:03:53.Z
+    error: duplicate field seen: WARC-Date 2017-03-06T04:03:53.0Z
+    comment: unknown WARC-Type: WARC-Type invalid
   WARC-Record-ID None
     WARC-Type request
     digest not present
-    error: missing required header Content-Type
-    error: missing required header WARC-Date
-    error: missing required header WARC-Record-ID
-    error: missing required header WARC-Target-URI
+    error: missing required header: Content-Type
+    error: missing required header: WARC-Date
+    error: missing required header: WARC-Record-ID
+    error: missing required header: WARC-Target-URI
     recommendation: do not segment WARC-Type request
-    comment: no configuration seen for WARC-Segment-Number request
+    comment: Unknown field for this record type, perhaps an extension: request WARC-Segment-Number
 """
 
     value = helper(args, 0)
@@ -312,9 +312,9 @@ def test_digests():
   WARC-Record-ID <urn:uuid:e6e395ca-0221-11e7-a18d-0242ac120005>
     WARC-Type revisit
     digest present but not checked
-    recommendation: missing recommended header WARC-Refers-To
-    comment: field was introduced after this warc version WARC-Refers-To-Target-URI http://example.com/ 1.0
-    comment: field was introduced after this warc version WARC-Refers-To-Date 2017-03-06T04:02:06Z 1.0
+    recommendation: missing recommended header: WARC-Refers-To
+    comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com/
+    comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Date 2017-03-06T04:02:06Z
   WARC-Record-ID <urn:uuid:e6e41fea-0221-11e7-8fe3-0242ac120007>
     WARC-Type request
     digest not present
@@ -330,14 +330,14 @@ def test_leftovers():
     assert not commentary.has_comments()
 
     # hard to test because invalid WARC Content-Length raises in archiveiterator
-    warcio.tester.validate_content_length('content-length', 'not-an-integer', None, '1.0', commentary, None)
+    warcio.tester.validate_content_length('Content-Length', 'not-an-integer', None, '1.0', commentary, None)
 
     # hard to test because warcio checks the WARC version
     warcio.tester.validate_profile('blah', 'blah', None, '999', commentary, None)
 
     expected = '''\
-error: must be an integer content-length not-an-integer
-comment: no profile check because unknown warc version blah blah
+error: must be an integer: Content-Length not-an-integer
+comment: no profile check because unknown warc version: blah blah
 '''
 
     assert '\n'.join(commentary.comments())+'\n' == expected
diff --git a/warcio/tester.py b/warcio/tester.py
index 2300d062..4ee05f1f 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -59,7 +59,8 @@ def __getattr__(self, name):
 
 
 def canon_content_type(s):
-    return s.lower().replace('; ', ';')
+    # wget omits the space after the ;, let that pass
+    return s.lower().replace(';msgtype=', '; msgtype=')
 
 
 def validate_warc_fields(record, commentary):
@@ -106,11 +107,11 @@ def validate_warc_fields(record, commentary):
         else:
             # check for field-name :
             if ':' not in line:
-                commentary.comment('Missing field-name : in warc-fields line:', line)
+                commentary.comment('Missing colon in warc-fields line:', line)
             else:
                 field_name = line.split(':', 1)[0]
                 if not re.search(token_re, field_name):
-                    commentary.comment('invalid warc-fields name:', field_name)
+                    commentary.comment('Invalid warc-fields name:', field_name)
                 else:
                     lines.append(line)
         first_line = False
@@ -125,7 +126,7 @@ def validate_warc_fields(record, commentary):
 def validate_warcinfo(record, commentary, pending):
     content_type = record.rec_headers.get_header('Content-Type', 'none')
     if content_type.lower() != 'application/warc-fields':
-        commentary.recommendation('warcinfo Content-Type recommended to be application/warc-fields, saw', content_type)
+        commentary.recommendation('warcinfo Content-Type recommended to be application/warc-fields, saw:', content_type)
     else:
         #   format: warc-fields
         #   allowable fields include but not limited to DMCI plus the following
@@ -145,8 +146,8 @@ def validate_response(record, commentary, pending):
 
     if target_uri.startswith('http:') or target_uri.startswith('https:'):
         content_type = record.rec_headers.get_header('Content-Type', 'none')
-        if canon_content_type(content_type) not in {'application/http;msgtype=response', 'application/http'}:
-            commentary.error('responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw', content_type)
+        if canon_content_type(content_type) not in {'application/http; msgtype=response', 'application/http'}:
+            commentary.error('responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw:', content_type)
 
         if record.rec_headers.get_header('WARC-IP-Address') is None:
             commentary.error('WARC-IP-Address should be used for http and https responses')
@@ -163,7 +164,7 @@ def validate_resource(record, commentary, pending):
     if target_uri.startswith('dns:'):
         content_type = record.rec_headers.get_header('Content-Type', 'none')
         if content_type.lower() != 'text/dns':
-            commentary.error('recource records for dns: shall have Content-Type of text/dns, saw', content_type)
+            commentary.error('recource records for dns: shall have Content-Type of text/dns, saw:', content_type)
         else:
             # rfc 2540 and rfc 1035
             #validate_text_dns()
@@ -178,8 +179,8 @@ def validate_request(record, commentary, pending):
     if target_uri.startswith('http:') or target_uri.startswith('https:'):
         content_type = record.rec_headers.get_header('Content-Type')
 
-        if canon_content_type(content_type) not in {'application/http;msgtype=request', 'application/http'}:
-            commentary.error('requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw', content_type)
+        if canon_content_type(content_type) not in {'application/http; msgtype=request', 'application/http'}:
+            commentary.error('requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw:', content_type)
 
         if record.rec_headers.get_header('WARC-IP-Address') is None:
             commentary.error('WARC-IP-Address should be used for http and https requests')
@@ -225,7 +226,7 @@ def validate_revisit(record, commentary, pending):
         #     if yes, should be like a response record, truncated if desired
         #   WARC-Refers-To-Date should be the same as WARC-Date in the original record if present
     else:
-        commentary.comment('no revisit details validation done due to unknown profile')
+        commentary.comment('no revisit details validation done due to unknown profile:', warc_profile)
 
 
 def validate_conversion(record, commentary, pending):
@@ -239,7 +240,7 @@ def validate_continuation(record, commentary, pending):
 
     segment_number = record.rec_headers.get_header('WARC-Segment-Number', 'none')
     if segment_number.isdigit() and int(segment_number) < 2:
-        commentary.error('continuation record must have WARC-Segment-Number > 1, saw', segment_number)
+        commentary.error('continuation record must have WARC-Segment-Number > 1, saw:', segment_number)
 
     # last segment: required WARC-Segment-Total-Length, optional WARC-Truncated
 
@@ -251,30 +252,30 @@ def validate_actual_uri(field, value, record, version, commentary, pending):
     # schemes are case-insensitive and normalize to lower
     if value.startswith('<') or value.endswith('>'):
         # wget 1.19 bug caused by WARC 1.0 spec error
-        commentary.error('uri must not be within <>', field, value)
+        commentary.error('uri must not be within <>:', field, value)
     if ':' not in value:
-        commentary.error('invalid uri, no scheme', field, value)
+        commentary.error('invalid uri, no scheme:', field, value)
     if re.search(r'\s', value):
-        commentary.error('invalid uri, contains whitespace', field, value)
+        commentary.error('invalid uri, contains whitespace:', field, value)
     scheme = value.split(':', 1)[0]
     if not re.search(r'\A[A-Za-z][A-Za-z0-9+\-\.]*\Z', scheme):
-        commentary.error('invalid uri scheme, bad character', field, value)
+        commentary.error('invalid uri scheme, bad character:', field, value)
     # https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
 
 
 def validate_warc_type(field, value, record, version, commentary, pending):
     if not value.islower():
         # I am unclear if this is allowed? standard is silent
-        commentary.comment('WARC-Type is not lower-case', field, value)
+        commentary.comment('WARC-Type is not lower-case:', field, value)
     if value.lower() not in record_types:
         # standard says readers should ignore unknown warc-types
-        commentary.comment('unknown WARC-Type', field, value)
+        commentary.comment('unknown WARC-Type:', field, value)
 
 
 def validate_uri(field, value, record, version, commentary, pending):
     # < uri >
     if not (value.startswith('<') and value.endswith('>')):
-        commentary.error('uri must be within <>', field, value)
+        commentary.error('uri must be within <>:', field, value)
         return
     validate_actual_uri(field, value[1:-1], record, version, commentary, pending)
 
@@ -289,12 +290,12 @@ def validate_timestamp(field, value, record, version, commentary, pending):
     if not use_ms:
         if '.' in value:
             # XXX specification infelicity: would be nice to have 'advice to implementers' here
-            commentary.error('WARC 1.0 may not have fractional seconds', field, value)
+            commentary.error('WARC 1.0 time may not have fractional seconds:', field, value)
     else:
         if '.' in value:
             start, end = value.split('.', 1)
             if not re.search(r'\A[0-9]{1,9}Z\Z', end):
-                commentary.error('fractional seconds must have 1-9 digits', field, value)
+                commentary.error('fractional seconds must have 1-9 digits:', field, value)
 
     # XXX the above is pretty incomplete for dash, colon, trailing Z, etc
 
@@ -304,7 +305,7 @@ def validate_timestamp(field, value, record, version, commentary, pending):
 
 def validate_content_length(field, value, record, version, commentary, pending):
     if not value.isdigit():
-        commentary.error('must be an integer', field, value)
+        commentary.error('must be an integer:', field, value)
 
 
 token_re = r'\A[!"#$%&\'()*+\-\.0-9A-Z\^_`a-z|~]+\Z'
@@ -313,7 +314,7 @@ def validate_content_length(field, value, record, version, commentary, pending):
 
 def validate_content_type(field, value, record, version, commentary, pending):
     if '/' not in value:
-        commentary.error('must contain a /', field, value)
+        commentary.error('must contain a /:', field, value)
     splits = value.split('/', 1)
     ctype = splits[0]
     if len(splits) > 1:
@@ -321,13 +322,13 @@ def validate_content_type(field, value, record, version, commentary, pending):
     else:
         rest = ''
     if not re.search(token_re, ctype):
-        commentary.error('invalid type', field, value)
+        commentary.error('invalid type:', field, value)
     if ';' in rest:
         subtype, rest = rest.split(';', 1)
     else:
         subtype = rest
     if not re.search(token_re, subtype):
-        commentary.error('invalid subtype', field, value)
+        commentary.error('invalid subtype:', field, value)
 
     # at this point there can be multiple parameters,
     # some of which could have quoted string values with ; in them
@@ -337,7 +338,7 @@ def validate_content_type(field, value, record, version, commentary, pending):
 
 def validate_digest(field, value, record, version, commentary, pending):
     if ':' not in value:
-        commentary.error('missing algorithm', field, value)
+        commentary.error('missing algorithm:', field, value)
     splits = value.split(':', 1)
     algorithm = splits[0]
     if len(splits) > 1:
@@ -345,18 +346,19 @@ def validate_digest(field, value, record, version, commentary, pending):
     else:
         digest = 'none'
     if not re.search(token_re, algorithm):
-        commentary.error('invalid algorithm', field, value)
+        commentary.error('invalid algorithm:', field, value)
     else:
         try:
             Digester(algorithm)
         except ValueError:
-            commentary.comment('unknown digest algorithm', field, value)
+            commentary.comment('unknown digest algorithm:', field, value)
     if not re.search(token_re, digest):
         # https://github.com/iipc/warc-specifications/issues/48
         # commentary.comment('spec incorrectly says this is an invalid digest', field, value)
         pass
     if not re.search(digest_re, digest):
-        commentary.comment('Invalid-looking digest value', field, value)
+        # suggested in https://github.com/iipc/warc-specifications/issues/48
+        commentary.comment('Invalid-looking digest value:', field, value)
 
 
 def validate_ip(field, value, record, version, commentary, pending):
@@ -366,14 +368,14 @@ def validate_ip(field, value, record, version, commentary, pending):
             value = unicode(value)
         ipaddress.ip_address(value)
     except ValueError:
-        commentary.error('invalid ip', field, value)
+        commentary.error('invalid ip:', field, value)
     except (ImportError, NameError):  # pragma: no cover
         commentary.comment('did not check ip address format, install ipaddress module from pypi if you care')
 
 
 def validate_truncated(field, value, record, version, commentary, pending):
     if value.lower() not in {'length', 'time', 'disconnect', 'unspecified'}:
-        commentary.comment('extension seen', field, value)
+        commentary.comment('extension seen:', field, value)
 
 
 def validate_warcinfo_id(field, value, record, version, commentary, pending):
@@ -400,31 +402,31 @@ def validate_filename(field, value, record, version, commentary, pending):
 
 def validate_profile(field, value, record, version, commentary, pending):
     if version not in profiles:
-        commentary.comment('no profile check because unknown warc version', field, value)
+        commentary.comment('no profile check because unknown warc version:', field, value)
         return
     if value not in profiles[version]:
-        commentary.comment('extension seen', field, value)
+        commentary.comment('extension seen:', field, value)
 
 
 def validate_segment_number(field, value, record, version, commentary, pending):
     if not value.isdigit():
-        commentary.error('must be an integer', field, value)
+        commentary.error('must be an integer:', field, value)
         return
     iv = int(value)
     if iv == 0:
-        commentary.error('must be 1 or greater', field, value)
+        commentary.error('must be 1 or greater:', field, value)
 
     rec_type = record.rec_headers.get_header('WARC-Type', 'none')
     if rec_type != 'continuation':
         if iv != 1:
-            commentary.error('non-continuation records must always have WARC-Segment-Number = 1', field, value)
+            commentary.error('non-continuation records must always have WARC-Segment-Number: 1:', field, value)
     if rec_type in {'warcinfo', 'request', 'metadata', 'revisit'}:
         commentary.recommendation('do not segment WARC-Type', rec_type)
 
 
 def validate_segment_total_length(field, value, record, version, commentary, pending):
     if not value.isdigit():
-        commentary.error('must be an integer', field, value)
+        commentary.error('must be an integer:', field, value)
 
 
 warc_fields = {
@@ -568,21 +570,21 @@ def make_header_set(config, kinds):
 def validate_fields_against_rec_type(rec_type, config, rec_headers, commentary, allow_all=False):
     for req in sorted(config.get('required', [])):
         if not rec_headers.get_header(req):
-            commentary.error('missing required header', req)
+            commentary.error('missing required header:', req)
     for rec in sorted(config.get('recommended', [])):
         if not rec_headers.get_header(rec):
-            commentary.recommendation('missing recommended header', rec)
+            commentary.recommendation('missing recommended header:', rec)
     allowed = make_header_set(config, ('required', 'optional', 'recommended'))
     prohibited = make_header_set(config, ('prohibited',))
 
     for field, value in rec_headers.headers:
         fl = field.lower()
         if fl in prohibited:
-            commentary.error('field not allowed in record_type', field, rec_type)
+            commentary.error('field not allowed in record type:', rec_type, field)
         elif allow_all or fl in allowed:
             pass
         elif fl in warc_fields:
-            commentary.comment('no configuration seen for', field, rec_type)
+            commentary.comment('Unknown field for this record type, perhaps an extension:', rec_type, field)
         else:
             # an 'unknown field' comment has already been issued in validate_record
             pass
@@ -605,16 +607,16 @@ def validate_record(record):
     for field, value in record.rec_headers.headers:
         field_l = field.lower()
         if field != 'warc-concurrent-to' and field_l in seen_fields:
-            commentary.error('duplicate field seen', field, value)
+            commentary.error('duplicate field seen:', field, value)
         seen_fields.add(field_l)
         if field_l not in warc_fields:
-            commentary.comment('unknown field, no validation performed', field, value)
+            commentary.comment('unknown field, no validation performed:', field, value)
             continue
         config = warc_fields[field_l]
         if 'minver' in config:
             if version < config['minver']:
                 # unknown fields are extensions, so this is a comment and not an error
-                commentary.comment('field was introduced after this warc version', field, value, version)
+                commentary.comment('field was introduced after this warc version:', version, field, value)
         if 'validate' in config:
             config['validate'](field, value, record, version, commentary, pending)
 

From 46874975664acf2ad8615511212df4edecb78d4b Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Mon, 28 Jan 2019 13:15:30 -0800
Subject: [PATCH 52/68] use valid record ids

---
 .../data/standard-torture-validate-field.warc | 106 +++++++++---------
 .../standard-torture-validate-record.warc     |  32 +++---
 test/test_tests.py                            |  62 ++++------
 warcio/tester.py                              |  30 ++---
 4 files changed, 107 insertions(+), 123 deletions(-)

diff --git a/test/data/standard-torture-validate-field.warc b/test/data/standard-torture-validate-field.warc
index c88d3ee6..816413be 100644
--- a/test/data/standard-torture-validate-field.warc
+++ b/test/data/standard-torture-validate-field.warc
@@ -1,53 +1,53 @@
-WARC/1.0
-WARC-Target-URI: <http://example.com/>
-WARC-Target-URI: example.com
-WARC-Target-URI: ex ample.com
-WARC-Target-URI: h<>ttp://example.com/
-WARC-Type: does-not-exist
-WARC-Type: CAPITALIZED
-WARC-Concurrent-To: http://example.com/
-WARC-Record-ID: <foo:bar>
-WARC-Date: 2017-03-06T04:03:53Z
-WARC-Date: 2017-03-06T04:03:53.Z
-Content-Type: asdf
-Content-Type: has space/asdf
-Content-Type: asdf/has space
-Content-Type: asdf/has space;asdf
-WARC-Block-Digest: asdf
-WARC-Block-Digest: has space:asdf
-WARC-Block-Digest: sha1:&$*^&*^#*&^
-WARC-IP-Address: 1.2.3.4.5
-WARC-Truncated: invalid
-WARC-Warcinfo-ID: asdf:asdf
-WARC-Filename: not-yet-tested
-WARC-Profile: asdf
-WARC-Profile: http://netpreserve.org/warc/1.0/revisit/identical-payload-digest
-WARC-Identified-Payload-Type: asdf
-WARC-Segment-Origin-ID: http://example.com
-WARC-Segment-Number: not-an-integer
-WARC-Segment-Number: 0
-WARC-Segment-Number: 1
-WARC-Segment-Number: 2
-WARC-Segment-Total-Length: 0
-WARC-Segment-Total-Length: not-an-integer
-WARC-Refers-To-Target-URI: http://example.com
-WARC-Refers-To-Date: not-a-date
-WARC-Unknown-Field: asdf
-Content-Length: 0
-
-
-WARC/1.1
-WARC-Date: 2017-03-06T04:03:53Z
-WARC-Date: 2017-03-06T04:03:53.Z
-WARC-Date: 2017-03-06T04:03:53.0Z
-WARC-Type: invalid
-Content-Length: 0
-
-
-WARC/1.1
-WARC-Type: request
-WARC-Segment-Number: 1
-Content-Length: 0
-
-
-WARC/invalid
+WARC/1.0
+WARC-Target-URI: <http://example.com/>
+WARC-Target-URI: example.com
+WARC-Target-URI: ex ample.com
+WARC-Target-URI: h<>ttp://example.com/
+WARC-Type: does-not-exist
+WARC-Type: CAPITALIZED
+WARC-Concurrent-To: http://example.com/
+WARC-Record-ID: <urn:uuid:torture-validate-field>
+WARC-Date: 2017-03-06T04:03:53Z
+WARC-Date: 2017-03-06T04:03:53.Z
+Content-Type: asdf
+Content-Type: has space/asdf
+Content-Type: asdf/has space
+Content-Type: asdf/has space;asdf
+WARC-Block-Digest: asdf
+WARC-Block-Digest: has space:asdf
+WARC-Block-Digest: sha1:&$*^&*^#*&^
+WARC-IP-Address: 1.2.3.4.5
+WARC-Truncated: invalid
+WARC-Warcinfo-ID: asdf:asdf
+WARC-Filename: not-yet-tested
+WARC-Profile: asdf
+WARC-Profile: http://netpreserve.org/warc/1.0/revisit/identical-payload-digest
+WARC-Identified-Payload-Type: asdf
+WARC-Segment-Origin-ID: http://example.com
+WARC-Segment-Number: not-an-integer
+WARC-Segment-Number: 0
+WARC-Segment-Number: 1
+WARC-Segment-Number: 2
+WARC-Segment-Total-Length: 0
+WARC-Segment-Total-Length: not-an-integer
+WARC-Refers-To-Target-URI: http://example.com
+WARC-Refers-To-Date: not-a-date
+WARC-Unknown-Field: asdf
+Content-Length: 0
+
+
+WARC/1.1
+WARC-Date: 2017-03-06T04:03:53Z
+WARC-Date: 2017-03-06T04:03:53.Z
+WARC-Date: 2017-03-06T04:03:53.0Z
+WARC-Type: invalid
+Content-Length: 0
+
+
+WARC/1.1
+WARC-Type: request
+WARC-Segment-Number: 1
+Content-Length: 0
+
+
+WARC/invalid
diff --git a/test/data/standard-torture-validate-record.warc b/test/data/standard-torture-validate-record.warc
index fa03b38e..da6a2aaf 100644
--- a/test/data/standard-torture-validate-record.warc
+++ b/test/data/standard-torture-validate-record.warc
@@ -15,7 +15,7 @@ token cannot have a space:
 
 
 WARC/1.0
-WARC-Record-ID: test-empty-warc-fields
+WARC-Record-ID: <uri:uuid:test-empty-warc-fields>
 WARC-Type: warcinfo
 Content-Type: application/warc-fields
 Content-Length: 0
@@ -23,7 +23,7 @@ Content-Length: 0
 
 WARC/1.0
 WARC-Type: warcinfo
-WARC-Record-ID: test-warcinfo-non-recommended-content-type
+WARC-Record-ID: <uri:uuid:test-warcinfo-non-recommended-content-type>
 Content-Type: not-application/warc-fields
 Content-Length: 5
 
@@ -32,7 +32,7 @@ foo
 
 WARC/1.0
 WARC-Type: response
-WARC-Record-ID: test-response-content-type
+WARC-Record-ID: <uri:uuid:test-response-content-type>
 WARC-Target-URI: HtTp://example.com/
 Content-Type: text/plain
 Content-Length: 0
@@ -40,7 +40,7 @@ Content-Length: 0
 
 WARC/1.0
 WARC-Type: resource
-WARC-Record-ID: test-resource-dns-content-type
+WARC-Record-ID: <uri:uuid:test-resource-dns-content-type>
 WARC-Target-URI: DnS:asdfasdf
 Content-Type: text/plain
 Content-Length: 0
@@ -48,7 +48,7 @@ Content-Length: 0
 
 WARC/1.0
 WARC-Type: resource
-WARC-Record-ID: test-resource-dns-empty
+WARC-Record-ID: <uri:uuid:test-resource-dns-empty>
 WARC-Test-TODO: add another with valid block
 WARC-Target-URI: DnS:asdfasdf
 Content-Type: text/dns
@@ -57,14 +57,14 @@ Content-Length: 0
 
 WARC/1.0
 WARC-Type: resource
-WARC-Record-ID: test-resource-not-dns
+WARC-Record-ID: <uri:uuid:test-resource-not-dns>
 WARC-Target-URI: foo:bar
 Content-Length: 0
 
 
 WARC/1.0
 WARC-Type: request
-WARC-Record-ID: test-request-unrecommended-content-type
+WARC-Record-ID: <uri:uuid:test-request-content-type>
 WARC-Target-URI: hTtP://example.com/
 Content-Type: text/plain
 Content-Length: 0
@@ -72,7 +72,7 @@ Content-Length: 0
 
 WARC/1.0
 WARC-Type: request
-WARC-Record-ID: test-request-unrecommended-content-type-with-ip
+WARC-Record-ID: <uri:uuid:test-request-content-type-with-ip>
 WARC-Target-URI: hTtP://example.com/
 WARC-IP-Address: 1.2.3.4
 Content-Type: text/plain
@@ -81,55 +81,55 @@ Content-Length: 0
 
 WARC/1.0
 WARC-Type: metadata
-WARC-Record-ID: test-metadata-warc-fields-empty
+WARC-Record-ID: <uri:uuid:test-metadata-warc-fields-empty>
 Content-Type: application/warc-fields
 Content-Length: 0
 
 
 WARC/1.0
 WARC-Type: metadata
-WARC-Record-ID: test-metadata-not-warc-fields
+WARC-Record-ID: <uri:uuid:test-metadata-not-warc-fields>
 Content-Type: not-application/warc-fields
 Content-Length: 0
 
 
 WARC/1.0
 WARC-Type: revisit
-WARC-Record-ID: test-revisit-profile-unknown
+WARC-Record-ID: <uri:uuid:test-revisit-profile-unknown>
 WARC-Profile: none
 Content-Length: 0
 
 
 WARC/1.0
 WARC-Type: revisit
-WARC-Record-ID: test-revisit-profile-future
+WARC-Record-ID: <uri:uuid:test-revisit-profile-future>
 WARC-Profile: http://netpreserve.org/warc/1.1/revisit/identical-payload-digest
 Content-Length: 0
 
 
 WARC/1.0
 WARC-Type: revisit
-WARC-Record-ID: test-revisit-profile-good
+WARC-Record-ID: <uri:uuid:test-revisit-profile-good>
 WARC-Profile: http://netpreserve.org/warc/1.0/revisit/server-not-modified
 Content-Length: 0
 
 
 WARC/1.0
 WARC-Type: conversion
-WARC-Record-ID: test-conversion
+WARC-Record-ID: <uri:uuid:test-conversion>
 Content-Length: 0
 
 
 WARC/1.0
 WARC-Type: continuation
-WARC-Record-ID: test-continuation-segment-1
+WARC-Record-ID: <uri:uuid:test-continuation-segment-1>
 WARC-Segment-Number: 1
 Content-Length: 0
 
 
 WARC/1.0
 WARC-Type: continuation
-WARC-Record-ID: test-continuation-segment-valid
+WARC-Record-ID: <uri:uuid:test-continuation-segment-valid>
 WARC-Segment-Number: 2
 Content-Length: 0
 
diff --git a/test/test_tests.py b/test/test_tests.py
index 91eba656..c08a19f6 100644
--- a/test/test_tests.py
+++ b/test/test_tests.py
@@ -51,80 +51,68 @@ def test_torture_validate_record():
     comment: warc-fields lines must end with \\r\\n: test: lines should end with \\r\\n
     comment: Missing colon in warc-fields line: no colon
     comment: Invalid warc-fields name: token cannot have a space
-  WARC-Record-ID test-empty-warc-fields
+  WARC-Record-ID <uri:uuid:test-empty-warc-fields>
     WARC-Type warcinfo
     digest not present
-    error: uri must be within <>: WARC-Record-ID test-empty-warc-fields
     error: missing required header: WARC-Date
     comment: warc-fields body present but empty
-  WARC-Record-ID test-warcinfo-non-recommended-content-type
+  WARC-Record-ID <uri:uuid:test-warcinfo-non-recommended-content-type>
     WARC-Type warcinfo
     digest not present
-    error: uri must be within <>: WARC-Record-ID test-warcinfo-non-recommended-content-type
     error: missing required header: WARC-Date
-    recommendation: warcinfo Content-Type recommended to be application/warc-fields, saw: not-application/warc-fields
-  WARC-Record-ID test-response-content-type
+    recommendation: warcinfo Content-Type recommended to be application/warc-fields: not-application/warc-fields
+  WARC-Record-ID <uri:uuid:test-response-content-type>
     WARC-Type response
     digest not present
-    error: uri must be within <>: WARC-Record-ID test-response-content-type
     error: missing required header: WARC-Date
-    error: responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw: text/plain
+    error: responses for http/https should have Content-Type of application/http; msgtype=response or application/http: text/plain
     error: WARC-IP-Address should be used for http and https responses
-  WARC-Record-ID test-resource-dns-content-type
+  WARC-Record-ID <uri:uuid:test-resource-dns-content-type>
     WARC-Type resource
     digest not present
-    error: uri must be within <>: WARC-Record-ID test-resource-dns-content-type
     error: missing required header: WARC-Date
-    error: recource records for dns: shall have Content-Type of text/dns, saw: text/plain
-  WARC-Record-ID test-resource-dns-empty
+    error: resource records for dns shall have Content-Type of text/dns: text/plain
+  WARC-Record-ID <uri:uuid:test-resource-dns-empty>
     WARC-Type resource
     digest not present
-    error: uri must be within <>: WARC-Record-ID test-resource-dns-empty
     error: missing required header: WARC-Date
     comment: unknown field, no validation performed: WARC-Test-TODO add another with valid block
-  WARC-Record-ID test-resource-not-dns
+  WARC-Record-ID <uri:uuid:test-resource-not-dns>
     WARC-Type resource
     digest not present
-    error: uri must be within <>: WARC-Record-ID test-resource-not-dns
     error: missing required header: Content-Type
     error: missing required header: WARC-Date
-  WARC-Record-ID test-request-unrecommended-content-type
+  WARC-Record-ID <uri:uuid:test-request-content-type>
     WARC-Type request
     digest not present
-    error: uri must be within <>: WARC-Record-ID test-request-unrecommended-content-type
     error: missing required header: WARC-Date
-    error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw: text/plain
+    error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http: text/plain
     error: WARC-IP-Address should be used for http and https requests
-  WARC-Record-ID test-request-unrecommended-content-type-with-ip
+  WARC-Record-ID <uri:uuid:test-request-content-type-with-ip>
     WARC-Type request
     digest not present
-    error: uri must be within <>: WARC-Record-ID test-request-unrecommended-content-type-with-ip
     error: missing required header: WARC-Date
-    error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw: text/plain
-  WARC-Record-ID test-metadata-warc-fields-empty
+    error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http: text/plain
+  WARC-Record-ID <uri:uuid:test-metadata-warc-fields-empty>
     WARC-Type metadata
     digest not present
-    error: uri must be within <>: WARC-Record-ID test-metadata-warc-fields-empty
     error: missing required header: WARC-Date
     comment: warc-fields body present but empty
-  WARC-Record-ID test-metadata-not-warc-fields
+  WARC-Record-ID <uri:uuid:test-metadata-not-warc-fields>
     WARC-Type metadata
     digest not present
-    error: uri must be within <>: WARC-Record-ID test-metadata-not-warc-fields
     error: missing required header: WARC-Date
-  WARC-Record-ID test-revisit-profile-unknown
+  WARC-Record-ID <uri:uuid:test-revisit-profile-unknown>
     WARC-Type revisit
     digest not present
-    error: uri must be within <>: WARC-Record-ID test-revisit-profile-unknown
     error: missing required header: Content-Type
     error: missing required header: WARC-Date
     error: missing required header: WARC-Target-URI
     comment: extension seen: WARC-Profile none
     comment: no revisit details validation done due to unknown profile: none
-  WARC-Record-ID test-revisit-profile-future
+  WARC-Record-ID <uri:uuid:test-revisit-profile-future>
     WARC-Type revisit
     digest not present
-    error: uri must be within <>: WARC-Record-ID test-revisit-profile-future
     error: missing required header: Content-Type
     error: missing required header: WARC-Date
     error: missing required header: WARC-Target-URI
@@ -133,34 +121,30 @@ def test_torture_validate_record():
     recommendation: missing recommended header: WARC-Refers-To-Date
     recommendation: missing recommended header: WARC-Refers-To-Target-URI
     comment: extension seen: WARC-Profile http://netpreserve.org/warc/1.1/revisit/identical-payload-digest
-  WARC-Record-ID test-revisit-profile-good
+  WARC-Record-ID <uri:uuid:test-revisit-profile-good>
     WARC-Type revisit
     digest not present
-    error: uri must be within <>: WARC-Record-ID test-revisit-profile-good
     error: missing required header: Content-Type
     error: missing required header: WARC-Date
     error: missing required header: WARC-Target-URI
     recommendation: missing recommended header: WARC-Refers-To
     recommendation: missing recommended header: WARC-Refers-To-Date
-  WARC-Record-ID test-conversion
+  WARC-Record-ID <uri:uuid:test-conversion>
     WARC-Type conversion
     digest not present
-    error: uri must be within <>: WARC-Record-ID test-conversion
     error: missing required header: WARC-Date
     error: missing required header: WARC-Target-URI
-  WARC-Record-ID test-continuation-segment-1
+  WARC-Record-ID <uri:uuid:test-continuation-segment-1>
     WARC-Type continuation
     digest not present
-    error: uri must be within <>: WARC-Record-ID test-continuation-segment-1
     error: missing required header: WARC-Date
     error: missing required header: WARC-Segment-Origin-ID
     error: missing required header: WARC-Target-URI
-    error: continuation record must have WARC-Segment-Number > 1, saw: 1
+    error: continuation record must have WARC-Segment-Number > 1: 1
     comment: warcio test continuation code has not been tested, expect bugs
-  WARC-Record-ID test-continuation-segment-valid
+  WARC-Record-ID <uri:uuid:test-continuation-segment-valid>
     WARC-Type continuation
     digest not present
-    error: uri must be within <>: WARC-Record-ID test-continuation-segment-valid
     error: missing required header: WARC-Date
     error: missing required header: WARC-Segment-Origin-ID
     error: missing required header: WARC-Target-URI
@@ -184,7 +168,7 @@ def test_torture_validate_field():
 
     expected = """\
 test/data/standard-torture-validate-field.warc
-  WARC-Record-ID <foo:bar>
+  WARC-Record-ID <urn:uuid:torture-validate-field>
     WARC-Type does-not-exist
     unknown hash algorithm name in block digest
     error: uri must not be within <>: WARC-Target-URI <http://example.com/>
diff --git a/warcio/tester.py b/warcio/tester.py
index 4ee05f1f..023cdb29 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -126,7 +126,7 @@ def validate_warc_fields(record, commentary):
 def validate_warcinfo(record, commentary, pending):
     content_type = record.rec_headers.get_header('Content-Type', 'none')
     if content_type.lower() != 'application/warc-fields':
-        commentary.recommendation('warcinfo Content-Type recommended to be application/warc-fields, saw:', content_type)
+        commentary.recommendation('warcinfo Content-Type recommended to be application/warc-fields:', content_type)
     else:
         #   format: warc-fields
         #   allowable fields include but not limited to DMCI plus the following
@@ -147,7 +147,7 @@ def validate_response(record, commentary, pending):
     if target_uri.startswith('http:') or target_uri.startswith('https:'):
         content_type = record.rec_headers.get_header('Content-Type', 'none')
         if canon_content_type(content_type) not in {'application/http; msgtype=response', 'application/http'}:
-            commentary.error('responses for http/https should have Content-Type of application/http; msgtype=response or application/http, saw:', content_type)
+            commentary.error('responses for http/https should have Content-Type of application/http; msgtype=response or application/http:', content_type)
 
         if record.rec_headers.get_header('WARC-IP-Address') is None:
             commentary.error('WARC-IP-Address should be used for http and https responses')
@@ -164,7 +164,7 @@ def validate_resource(record, commentary, pending):
     if target_uri.startswith('dns:'):
         content_type = record.rec_headers.get_header('Content-Type', 'none')
         if content_type.lower() != 'text/dns':
-            commentary.error('recource records for dns: shall have Content-Type of text/dns, saw:', content_type)
+            commentary.error('resource records for dns shall have Content-Type of text/dns:', content_type)
         else:
             # rfc 2540 and rfc 1035
             #validate_text_dns()
@@ -180,7 +180,7 @@ def validate_request(record, commentary, pending):
         content_type = record.rec_headers.get_header('Content-Type')
 
         if canon_content_type(content_type) not in {'application/http; msgtype=request', 'application/http'}:
-            commentary.error('requests for http/https should have Content-Type of application/http; msgtype=request or application/http, saw:', content_type)
+            commentary.error('requests for http/https should have Content-Type of application/http; msgtype=request or application/http:', content_type)
 
         if record.rec_headers.get_header('WARC-IP-Address') is None:
             commentary.error('WARC-IP-Address should be used for http and https requests')
@@ -240,12 +240,12 @@ def validate_continuation(record, commentary, pending):
 
     segment_number = record.rec_headers.get_header('WARC-Segment-Number', 'none')
     if segment_number.isdigit() and int(segment_number) < 2:
-        commentary.error('continuation record must have WARC-Segment-Number > 1, saw:', segment_number)
+        commentary.error('continuation record must have WARC-Segment-Number > 1:', segment_number)
 
     # last segment: required WARC-Segment-Total-Length, optional WARC-Truncated
 
 
-def validate_actual_uri(field, value, record, version, commentary, pending):
+def validate_unbracketed_uri(field, value, record, version, commentary, pending):
     # uri per RFC 3986
     # should use a registered scheme
     # %XX encoding, normalize to upper case
@@ -272,16 +272,16 @@ def validate_warc_type(field, value, record, version, commentary, pending):
         commentary.comment('unknown WARC-Type:', field, value)
 
 
-def validate_uri(field, value, record, version, commentary, pending):
+def validate_bracketed_uri(field, value, record, version, commentary, pending):
     # < uri >
     if not (value.startswith('<') and value.endswith('>')):
         commentary.error('uri must be within <>:', field, value)
         return
-    validate_actual_uri(field, value[1:-1], record, version, commentary, pending)
+    validate_unbracketed_uri(field, value[1:-1], record, version, commentary, pending)
 
 
 def validate_record_id(field, value, record, version, commentary, pending):
-    validate_uri(field, value, record, version, commentary, pending)
+    validate_bracketed_uri(field, value, record, version, commentary, pending)
     # TODO: should be "globally unique for its period of intended use"
 
 
@@ -379,7 +379,7 @@ def validate_truncated(field, value, record, version, commentary, pending):
 
 
 def validate_warcinfo_id(field, value, record, version, commentary, pending):
-    validate_uri(field, value, record, version, commentary, pending)
+    validate_bracketed_uri(field, value, record, version, commentary, pending)
     # TODO: should point at a warcinfo record
 
 
@@ -446,7 +446,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen
         'validate': validate_content_type,
     },
     'WARC-Concurrent-To': {
-        'validate': validate_uri,
+        'validate': validate_bracketed_uri,
     },
     'WARC-Block-Digest': {
         'validate': validate_digest,
@@ -458,10 +458,10 @@ def validate_segment_total_length(field, value, record, version, commentary, pen
         'validate': validate_ip,
     },
     'WARC-Refers-To': {
-        'validate': validate_uri,
+        'validate': validate_bracketed_uri,
     },
     'WARC-Target-URI': {
-        'validate': validate_actual_uri,
+        'validate': validate_unbracketed_uri,
     },
     'WARC-Truncated': {
         'validate': validate_truncated,
@@ -479,7 +479,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen
         'validate': validate_content_type,
     },
     'WARC-Segment-Origin-ID': {
-        'validate': validate_uri,
+        'validate': validate_bracketed_uri,
     },
     'WARC-Segment-Number': {
         'validate': validate_segment_number,
@@ -488,7 +488,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen
         'validate': validate_segment_total_length,
     },
     'WARC-Refers-To-Target-URI': {
-        'validate': validate_actual_uri,
+        'validate': validate_unbracketed_uri,
         'minver': '1.1',
     },
     'WARC-Refers-To-Date': {

From bcfe672f26506bcf9b070834c2fb6842f5d1028c Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Mon, 28 Jan 2019 13:31:40 -0800
Subject: [PATCH 53/68] warc-segment-number cleaner recommendation

---
 test/test_tests.py |  1 -
 warcio/tester.py   | 12 ++++++++----
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/test/test_tests.py b/test/test_tests.py
index c08a19f6..dcbc3666 100644
--- a/test/test_tests.py
+++ b/test/test_tests.py
@@ -237,7 +237,6 @@ def test_torture_validate_field():
     error: missing required header: WARC-Record-ID
     error: missing required header: WARC-Target-URI
     recommendation: do not segment WARC-Type request
-    comment: Unknown field for this record type, perhaps an extension: request WARC-Segment-Number
 """
 
     value = helper(args, 0)
diff --git a/warcio/tester.py b/warcio/tester.py
index 023cdb29..6346754d 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -503,20 +503,21 @@ def validate_segment_total_length(field, value, record, version, commentary, pen
         'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'Content-Type'],
         'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Filename', 'WARC-Truncated'],
         'prohibited': ['WARC-Refers-To', 'WARC-Profile', 'WARC-Identified-Payload-Type'],
+        'ignored': ['WARC-Segment-Number'],
         'validate': validate_warcinfo,
     },
     'response': {
         'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type',
                      'Content-Type', 'WARC-Target-URI'],
         'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated',
-                     'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address'],
+                     'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Segment-Number'],
         'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
         'validate': validate_response,
     },
     'resource': {
         'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'WARC-Target-URI', 'Content-Type'],
         'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated',
-                     'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-Identified-Payload-Type'],
+                     'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-Identified-Payload-Type', 'WARC-Segment-Number'],
         'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
         'validate': validate_resource,
     },
@@ -526,6 +527,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen
         'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated',
                      'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address'],
         'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
+        'ignored': ['WARC-Segment-Number'],
         'validate': validate_request,
     },
     'metadata': {
@@ -534,6 +536,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen
         'optional': ['WARC-Block-Digest', 'WARC-IP-Address', 'WARC-Truncated',
                      'WARC-Concurrent-To', 'WARC-Refers-To', 'WARC-Target-URI', 'WARC-Warcinfo-ID'],
         'prohibited': ['WARC-Payload-Digest', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
+        'ignored': ['WARC-Segment-Number'],
         'validate': validate_metadata,
     },
     'revisit': {
@@ -542,11 +545,12 @@ def validate_segment_total_length(field, value, record, version, commentary, pen
         'optional': ['WARC-Block-Digest', 'WARC-Truncated', 'WARC-IP-Address', 'WARC-Warcinfo-ID',  # normal optionals
                      'WARC-Payload-Digest', 'WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date'],  # these are for profiles
         'prohibited': ['WARC-Filename'],
+        'ignored': ['WARC-Segment-Number'],
         'validate': validate_revisit,
     },
     'conversion': {
         'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'WARC-Target-URI'],
-        'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Truncated', 'WARC-Refers-To', 'WARC-Warcinfo-ID'],
+        'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Truncated', 'WARC-Refers-To', 'WARC-Warcinfo-ID', 'WARC-Segment-Number'],
         'prohibited': ['WARC-Concurrent-To', 'WARC-IP-Address', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
         'validate': validate_conversion,
     },
@@ -574,7 +578,7 @@ def validate_fields_against_rec_type(rec_type, config, rec_headers, commentary,
     for rec in sorted(config.get('recommended', [])):
         if not rec_headers.get_header(rec):
             commentary.recommendation('missing recommended header:', rec)
-    allowed = make_header_set(config, ('required', 'optional', 'recommended'))
+    allowed = make_header_set(config, ('required', 'optional', 'recommended', 'ignored'))
     prohibited = make_header_set(config, ('prohibited',))
 
     for field, value in rec_headers.headers:

From 7f715c055c25f2a8486555196dc683ba59a0220d Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Mon, 28 Jan 2019 13:55:54 -0800
Subject: [PATCH 54/68] segment origin id

---
 test/test_tests.py |  1 +
 warcio/tester.py   | 23 +++++++++++++----------
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/test/test_tests.py b/test/test_tests.py
index dcbc3666..598ba49b 100644
--- a/test/test_tests.py
+++ b/test/test_tests.py
@@ -232,6 +232,7 @@ def test_torture_validate_field():
   WARC-Record-ID None
     WARC-Type request
     digest not present
+    error: segmented records must have both WARC-Segment-Number and WARC-Segment-Origin-ID
     error: missing required header: Content-Type
     error: missing required header: WARC-Date
     error: missing required header: WARC-Record-ID
diff --git a/warcio/tester.py b/warcio/tester.py
index 6346754d..632de060 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -420,6 +420,9 @@ def validate_segment_number(field, value, record, version, commentary, pending):
     if rec_type != 'continuation':
         if iv != 1:
             commentary.error('non-continuation records must always have WARC-Segment-Number: 1:', field, value)
+        origin_id = record.rec_headers.get_header('WARC-Segment-Origin-ID')
+        if origin_id is None:
+            commentary.error('segmented records must have both WARC-Segment-Number and WARC-Segment-Origin-ID')
     if rec_type in {'warcinfo', 'request', 'metadata', 'revisit'}:
         commentary.recommendation('do not segment WARC-Type', rec_type)
 
@@ -503,21 +506,21 @@ def validate_segment_total_length(field, value, record, version, commentary, pen
         'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'Content-Type'],
         'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Filename', 'WARC-Truncated'],
         'prohibited': ['WARC-Refers-To', 'WARC-Profile', 'WARC-Identified-Payload-Type'],
-        'ignored': ['WARC-Segment-Number'],
+        'ignored': ['WARC-Segment-Number', 'WARC-Segment-Origin-ID'],
         'validate': validate_warcinfo,
     },
     'response': {
         'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type',
                      'Content-Type', 'WARC-Target-URI'],
         'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated',
-                     'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Segment-Number'],
+                     'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Segment-Number', 'WARC-Segment-Origin-ID'],
         'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
         'validate': validate_response,
     },
     'resource': {
         'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'WARC-Target-URI', 'Content-Type'],
         'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated',
-                     'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-Identified-Payload-Type', 'WARC-Segment-Number'],
+                     'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-Identified-Payload-Type', 'WARC-Segment-Number', 'WARC-Segment-Origin-ID'],
         'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
         'validate': validate_resource,
     },
@@ -527,7 +530,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen
         'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated',
                      'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address'],
         'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
-        'ignored': ['WARC-Segment-Number'],
+        'ignored': ['WARC-Segment-Number', 'WARC-Segment-Origin-ID'],
         'validate': validate_request,
     },
     'metadata': {
@@ -536,7 +539,7 @@ def validate_segment_total_length(field, value, record, version, commentary, pen
         'optional': ['WARC-Block-Digest', 'WARC-IP-Address', 'WARC-Truncated',
                      'WARC-Concurrent-To', 'WARC-Refers-To', 'WARC-Target-URI', 'WARC-Warcinfo-ID'],
         'prohibited': ['WARC-Payload-Digest', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
-        'ignored': ['WARC-Segment-Number'],
+        'ignored': ['WARC-Segment-Number', 'WARC-Segment-Origin-ID'],
         'validate': validate_metadata,
     },
     'revisit': {
@@ -545,18 +548,18 @@ def validate_segment_total_length(field, value, record, version, commentary, pen
         'optional': ['WARC-Block-Digest', 'WARC-Truncated', 'WARC-IP-Address', 'WARC-Warcinfo-ID',  # normal optionals
                      'WARC-Payload-Digest', 'WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date'],  # these are for profiles
         'prohibited': ['WARC-Filename'],
-        'ignored': ['WARC-Segment-Number'],
+        'ignored': ['WARC-Segment-Number', 'WARC-Segment-Origin-ID'],
         'validate': validate_revisit,
     },
     'conversion': {
         'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'WARC-Target-URI'],
-        'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Truncated', 'WARC-Refers-To', 'WARC-Warcinfo-ID', 'WARC-Segment-Number'],
+        'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Truncated', 'WARC-Refers-To', 'WARC-Warcinfo-ID', 'WARC-Segment-Number', 'WARC-Segment-Origin-ID'],
         'prohibited': ['WARC-Concurrent-To', 'WARC-IP-Address', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
         'validate': validate_conversion,
     },
     'continuation': {
         'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type',
-                     'WARC-Segment-Origin-ID', 'WARC-Segment-Number', 'WARC-Target-URI'],
+                     'WARC-Segment-Number', 'WARC-Segment-Origin-ID', 'WARC-Target-URI'],
         'optional': ['WARC-Segment-Total-Length', 'WARC-Truncated'],
         'prohibited': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
         'validate': validate_continuation,
@@ -587,8 +590,8 @@ def validate_fields_against_rec_type(rec_type, config, rec_headers, commentary,
             commentary.error('field not allowed in record type:', rec_type, field)
         elif allow_all or fl in allowed:
             pass
-        elif fl in warc_fields:
-            commentary.comment('Unknown field for this record type, perhaps an extension:', rec_type, field)
+        elif fl in warc_fields:  # pragma: no cover (this is a configuration error, if it happens)
+            commentary.comment('Known field, but not expected for this record type:', rec_type, field)
         else:
             # an 'unknown field' comment has already been issued in validate_record
             pass

From 2583f19c762037a66847e2a4087c16082a06fbfc Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Mon, 28 Jan 2019 14:51:19 -0800
Subject: [PATCH 55/68] timestamp checking

---
 test/test_tests.py |  6 ++++--
 warcio/tester.py   | 21 ++++++++-------------
 2 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/test/test_tests.py b/test/test_tests.py
index 598ba49b..89851eca 100644
--- a/test/test_tests.py
+++ b/test/test_tests.py
@@ -184,7 +184,8 @@ def test_torture_validate_field():
     error: duplicate field seen: WARC-Type CAPITALIZED
     error: uri must be within <>: WARC-Concurrent-To http://example.com/
     error: duplicate field seen: WARC-Date 2017-03-06T04:03:53.Z
-    error: WARC 1.0 time may not have fractional seconds: WARC-Date 2017-03-06T04:03:53.Z
+    error: Invalid timestamp: WARC-Date 2017-03-06T04:03:53.Z
+    error: WARC versions <= 1.0 may not have timestamps with fractional seconds: WARC-Date 2017-03-06T04:03:53.Z
     error: must contain a /: Content-Type asdf
     error: invalid subtype: Content-Type asdf
     error: duplicate field seen: Content-Type has space/asdf
@@ -212,6 +213,7 @@ def test_torture_validate_field():
     error: non-continuation records must always have WARC-Segment-Number: 1: WARC-Segment-Number 2
     error: duplicate field seen: WARC-Segment-Total-Length not-an-integer
     error: must be an integer: WARC-Segment-Total-Length not-an-integer
+    error: Invalid timestamp: WARC-Refers-To-Date not-a-date
     comment: unknown WARC-Type: WARC-Type does-not-exist
     comment: WARC-Type is not lower-case: WARC-Type CAPITALIZED
     comment: unknown WARC-Type: WARC-Type CAPITALIZED
@@ -226,7 +228,7 @@ def test_torture_validate_field():
     WARC-Type invalid
     digest not present
     error: duplicate field seen: WARC-Date 2017-03-06T04:03:53.Z
-    error: fractional seconds must have 1-9 digits: WARC-Date 2017-03-06T04:03:53.Z
+    error: Invalid timestamp: WARC-Date 2017-03-06T04:03:53.Z
     error: duplicate field seen: WARC-Date 2017-03-06T04:03:53.0Z
     comment: unknown WARC-Type: WARC-Type invalid
   WARC-Record-ID None
diff --git a/warcio/tester.py b/warcio/tester.py
index 632de060..5396ff3b 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -286,21 +286,16 @@ def validate_record_id(field, value, record, version, commentary, pending):
 
 
 def validate_timestamp(field, value, record, version, commentary, pending):
-    use_ms = False if version == '1.0' else True
-    if not use_ms:
-        if '.' in value:
-            # XXX specification infelicity: would be nice to have 'advice to implementers' here
-            commentary.error('WARC 1.0 time may not have fractional seconds:', field, value)
-    else:
-        if '.' in value:
-            start, end = value.split('.', 1)
-            if not re.search(r'\A[0-9]{1,9}Z\Z', end):
-                commentary.error('fractional seconds must have 1-9 digits:', field, value)
+    ISO_RE = r'\A\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:.\d{1,9})?Z\Z'
 
-    # XXX the above is pretty incomplete for dash, colon, trailing Z, etc
+    if not re.match(ISO_RE, value):
+        commentary.error('Invalid timestamp:', field, value)
 
-    # TODO: "multiple records written as part of a single capture event shall use the same WARC-Date"
-    # how? follow WARC-Concurrent-To pointer(s) from request to response(s)
+    use_ms = False if version <= '1.0' else True
+    if not use_ms:
+        if '.' in value:
+            # specification infelicity: would be nice to have 'advice to implementers' here
+            commentary.error('WARC versions <= 1.0 may not have timestamps with fractional seconds:', field, value)
 
 
 def validate_content_length(field, value, record, version, commentary, pending):

From 8eb87e845f0b0a76cb0f3dd035bbdb52c19f6bc0 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Mon, 28 Jan 2019 16:46:59 -0800
Subject: [PATCH 56/68] buglet

---
 test/data/standard-torture-validate-field.warc | 1 +
 warcio/tester.py                               | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/data/standard-torture-validate-field.warc b/test/data/standard-torture-validate-field.warc
index 816413be..126ba964 100644
--- a/test/data/standard-torture-validate-field.warc
+++ b/test/data/standard-torture-validate-field.warc
@@ -6,6 +6,7 @@ WARC-Target-URI: h<>ttp://example.com/
 WARC-Type: does-not-exist
 WARC-Type: CAPITALIZED
 WARC-Concurrent-To: http://example.com/
+WARC-Concurrent-To: <uri:urn:asdf-asdf-asdf>
 WARC-Record-ID: <urn:uuid:torture-validate-field>
 WARC-Date: 2017-03-06T04:03:53Z
 WARC-Date: 2017-03-06T04:03:53.Z
diff --git a/warcio/tester.py b/warcio/tester.py
index 5396ff3b..8e9d8da3 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -608,7 +608,7 @@ def validate_record(record):
     seen_fields = set()
     for field, value in record.rec_headers.headers:
         field_l = field.lower()
-        if field != 'warc-concurrent-to' and field_l in seen_fields:
+        if field_l != 'warc-concurrent-to' and field_l in seen_fields:
             commentary.error('duplicate field seen:', field, value)
         seen_fields.add(field_l)
         if field_l not in warc_fields:

From 3a8747e04641e4e4030981dc36a7898b686060f9 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Tue, 29 Jan 2019 17:52:05 -0800
Subject: [PATCH 57/68] global checks

---
 .../data/standard-torture-validate-field.warc |   2 +
 test/test_tests.py                            |  51 +++-
 warcio/tester.py                              | 278 +++++++++++++++---
 3 files changed, 276 insertions(+), 55 deletions(-)

diff --git a/test/data/standard-torture-validate-field.warc b/test/data/standard-torture-validate-field.warc
index 126ba964..a928a4c4 100644
--- a/test/data/standard-torture-validate-field.warc
+++ b/test/data/standard-torture-validate-field.warc
@@ -33,6 +33,8 @@ WARC-Segment-Total-Length: 0
 WARC-Segment-Total-Length: not-an-integer
 WARC-Refers-To-Target-URI: http://example.com
 WARC-Refers-To-Date: not-a-date
+WARC-Refers-To-Filename: asdf
+WARC-Refers-To-File-Offset: 1234
 WARC-Unknown-Field: asdf
 Content-Length: 0
 
diff --git a/test/test_tests.py b/test/test_tests.py
index 89851eca..ebbdb509 100644
--- a/test/test_tests.py
+++ b/test/test_tests.py
@@ -6,6 +6,14 @@
 from .test_cli import patch_stdout
 
 
+file_map = {}
+
+
+def map_test_file(filename):
+    file_map[filename] = get_test_file(filename)
+    return file_map[filename]
+
+
 def helper(args, expected_exit_value):
     with patch_stdout() as buff:
         exit_value = None
@@ -22,17 +30,16 @@ def helper(args, expected_exit_value):
 def remove_before_test_data(s):
     ret = ''
     for line in s.splitlines(True):
-        if '/test/data/' in line:
-            line = 'test/data/' + line.split('/test/data/', 1)[1]
-        if '\\test\\data\\' in line:
-            line = 'test/data/' + line.split('\\test\\data\\', 1)[1]
+        for filename, value in file_map.items():
+            if value in line:
+                line = line.replace(value, 'test/data/' + filename)
         ret += line
     return ret
 
 
 def test_torture_validate_record():
     files = ['standard-torture-validate-record.warc']
-    files = [get_test_file(filename) for filename in files]
+    files = [map_test_file(filename) for filename in files]
 
     args = ['test']
     args.extend(files)
@@ -55,7 +62,7 @@ def test_torture_validate_record():
     WARC-Type warcinfo
     digest not present
     error: missing required header: WARC-Date
-    comment: warc-fields body present but empty
+    comment: warc-fields block present but empty
   WARC-Record-ID <uri:uuid:test-warcinfo-non-recommended-content-type>
     WARC-Type warcinfo
     digest not present
@@ -67,6 +74,7 @@ def test_torture_validate_record():
     error: missing required header: WARC-Date
     error: responses for http/https should have Content-Type of application/http; msgtype=response or application/http: text/plain
     error: WARC-IP-Address should be used for http and https responses
+    error: http/https responses should have http headers
   WARC-Record-ID <uri:uuid:test-resource-dns-content-type>
     WARC-Type resource
     digest not present
@@ -97,7 +105,7 @@ def test_torture_validate_record():
     WARC-Type metadata
     digest not present
     error: missing required header: WARC-Date
-    comment: warc-fields body present but empty
+    comment: warc-fields block present but empty
   WARC-Record-ID <uri:uuid:test-metadata-not-warc-fields>
     WARC-Type metadata
     digest not present
@@ -108,7 +116,7 @@ def test_torture_validate_record():
     error: missing required header: Content-Type
     error: missing required header: WARC-Date
     error: missing required header: WARC-Target-URI
-    comment: extension seen: WARC-Profile none
+    comment: unknown value, perhaps an extension: WARC-Profile none
     comment: no revisit details validation done due to unknown profile: none
   WARC-Record-ID <uri:uuid:test-revisit-profile-future>
     WARC-Type revisit
@@ -120,7 +128,7 @@ def test_torture_validate_record():
     recommendation: missing recommended header: WARC-Refers-To
     recommendation: missing recommended header: WARC-Refers-To-Date
     recommendation: missing recommended header: WARC-Refers-To-Target-URI
-    comment: extension seen: WARC-Profile http://netpreserve.org/warc/1.1/revisit/identical-payload-digest
+    comment: WARC-Profile value is for a different version: 1.0 http://netpreserve.org/warc/1.1/revisit/identical-payload-digest
   WARC-Record-ID <uri:uuid:test-revisit-profile-good>
     WARC-Type revisit
     digest not present
@@ -161,7 +169,7 @@ def test_torture_validate_record():
 
 def test_torture_validate_field():
     files = ['standard-torture-validate-field.warc']
-    files = [get_test_file(filename) for filename in files]
+    files = [map_test_file(filename) for filename in files]
 
     args = ['test']
     args.extend(files)
@@ -219,10 +227,12 @@ def test_torture_validate_field():
     comment: unknown WARC-Type: WARC-Type CAPITALIZED
     comment: unknown digest algorithm: WARC-Block-Digest asdf
     comment: Invalid-looking digest value: WARC-Block-Digest sha1:&$*^&*^#*&^
-    comment: extension seen: WARC-Truncated invalid
-    comment: extension seen: WARC-Profile asdf
+    comment: unknown value, perhaps an extension: WARC-Truncated invalid
+    comment: unknown value, perhaps an extension: WARC-Profile asdf
     comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com
     comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Date not-a-date
+    comment: This Heretrix extension never made it into the standard: WARC-Refers-To-Filename asdf
+    comment: This Heretrix extension never made it into the standard: WARC-Refers-To-File-Offset 1234
     comment: unknown field, no validation performed: WARC-Unknown-Field asdf
   WARC-Record-ID None
     WARC-Type invalid
@@ -240,6 +250,11 @@ def test_torture_validate_field():
     error: missing required header: WARC-Record-ID
     error: missing required header: WARC-Target-URI
     recommendation: do not segment WARC-Type request
+global warcinfo checks
+  comment: WARC-Warcinfo-ID not found: <urn:uuid:torture-validate-field> WARC-Warcinfo-ID asdf:asdf
+global Concurrent-To checks
+  comment: WARC-Concurrent-To not found: <urn:uuid:torture-validate-field> WARC-Concurrent-To <uri:urn:asdf-asdf-asdf>
+  comment: WARC-Concurrent-To not found: <urn:uuid:torture-validate-field> WARC-Concurrent-To http://example.com/
 """
 
     value = helper(args, 0)
@@ -251,7 +266,7 @@ def test_torture_validate_field():
 
 def test_arc():
     files = ['does-not-exist.arc']
-    files = [get_test_file(filename) for filename in files]
+    files = [map_test_file(filename) for filename in files]
 
     args = ['test']
     args.extend(files)
@@ -267,7 +282,7 @@ def test_arc():
 def test_digests():
     # needed for test coverage
     files = ['example-digest-bad.warc', 'example.warc']
-    files = [get_test_file(filename) for filename in files]
+    files = [map_test_file(filename) for filename in files]
 
     args = ['test']
     args.extend(files)
@@ -282,23 +297,28 @@ def test_digests():
     WARC-Type request
     digest pass
     error: WARC-IP-Address should be used for http and https requests
+    error: Duplicate WARC-Record-ID: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007> found in files test/data/example-digest-bad.warc test/data/example-digest-bad.warc
   WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
     WARC-Type request
     digest pass
     error: WARC-IP-Address should be used for http and https requests
+    error: Duplicate WARC-Record-ID: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007> found in files test/data/example-digest-bad.warc test/data/example-digest-bad.warc
   WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
     WARC-Type request
     digest pass
     error: WARC-IP-Address should be used for http and https requests
+    error: Duplicate WARC-Record-ID: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007> found in files test/data/example-digest-bad.warc test/data/example-digest-bad.warc
 test/data/example.warc
   WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
     WARC-Type request
     digest not present
     error: WARC-IP-Address should be used for http and https requests
+    error: Duplicate WARC-Record-ID: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007> found in files test/data/example.warc test/data/example-digest-bad.warc
   WARC-Record-ID <urn:uuid:e6e395ca-0221-11e7-a18d-0242ac120005>
     WARC-Type revisit
     digest present but not checked
     recommendation: missing recommended header: WARC-Refers-To
+    comment: This Heretrix extension never made it into the standard: WARC-Profile http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest
     comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com/
     comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Date 2017-03-06T04:02:06Z
   WARC-Record-ID <urn:uuid:e6e41fea-0221-11e7-8fe3-0242ac120007>
@@ -318,12 +338,11 @@ def test_leftovers():
     # hard to test because invalid WARC Content-Length raises in archiveiterator
     warcio.tester.validate_content_length('Content-Length', 'not-an-integer', None, '1.0', commentary, None)
 
-    # hard to test because warcio checks the WARC version
+    # hard to test because warcio raises for unknown WARC version
     warcio.tester.validate_profile('blah', 'blah', None, '999', commentary, None)
 
     expected = '''\
 error: must be an integer: Content-Length not-an-integer
-comment: no profile check because unknown warc version: blah blah
 '''
 
     assert '\n'.join(commentary.comments())+'\n' == expected
diff --git a/warcio/tester.py b/warcio/tester.py
index 8e9d8da3..870c7d6e 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -3,14 +3,15 @@
 import re
 import sys
 import six
+from collections import defaultdict
 
 from warcio.archiveiterator import WARCIterator
 from warcio.utils import to_native_str, Digester
 from warcio.exceptions import ArchiveLoadFailed
 
 
-class Commentary:
-    def __init__(self, record_id, rec_type):
+class Commentary(object):
+    def __init__(self, record_id=None, rec_type=None):
         self._record_id = record_id
         self._rec_type = rec_type
         self.errors = []
@@ -37,6 +38,7 @@ def has_comments(self):
             return True
 
     def comments(self):
+        # XXX str() all of these, in case an int or other thing slips in?
         for e in self.errors:
             yield 'error: ' + ' '.join(e)
         for r in self.recommendations:
@@ -55,6 +57,13 @@ def __getattr__(self, name):
             if self._content is None:
                 self._content = self.obj.content_stream().read()
             return self._content
+        if name == 'stream_for_digest_check':
+            def _doit():
+                while True:
+                    piece = self.obj.content_stream().read(1024*1024)
+                    if len(piece) == 0:
+                        break
+            return _doit
         return getattr(self.__dict__['obj'], name)
 
 
@@ -117,7 +126,7 @@ def validate_warc_fields(record, commentary):
         first_line = False
 
     if not lines:
-        commentary.comment('warc-fields body present but empty')
+        commentary.comment('warc-fields block present but empty')
         return
 
     # check known fields
@@ -126,6 +135,7 @@ def validate_warc_fields(record, commentary):
 def validate_warcinfo(record, commentary, pending):
     content_type = record.rec_headers.get_header('Content-Type', 'none')
     if content_type.lower() != 'application/warc-fields':
+        # https://github.com/iipc/warc-specifications/issues/33 -- SHALL BE or recommended?
         commentary.recommendation('warcinfo Content-Type recommended to be application/warc-fields:', content_type)
     else:
         #   format: warc-fields
@@ -137,8 +147,8 @@ def validate_warcinfo(record, commentary, pending):
         validate_warc_fields(record, commentary)
 
     # whole-file tests:
-    # optional that warcinfo be first in file, still deserves a comment
-    # allowable for warcinfo to appear anywhere
+    # recommended that all files start with warcinfo
+    # elsewise allowable for warcinfo to appear anywhere
 
 
 def validate_response(record, commentary, pending):
@@ -152,10 +162,32 @@ def validate_response(record, commentary, pending):
         if record.rec_headers.get_header('WARC-IP-Address') is None:
             commentary.error('WARC-IP-Address should be used for http and https responses')
 
-        # error: http and https schemes should have http response headers
-        #   test by attempting to parse them?
+        if not record.http_headers:
+            commentary.error('http/https responses should have http headers')
+            return
 
-        # comment: verify http content-length, if present -- commoncrawl nutch bug
+        http_content_length = record.http_headers.get_header('Content-Length')
+        if http_content_length is None:
+            return
+
+        if not http_content_length.isdigit():
+            commentary.comment('http content length header is not an integer', str(http_content_length))
+            return
+
+        # We want to verify http_content_length, which is the size of the compressed payload
+        # Trying to catch that commoncrawl nutch bug that prefixed /r/n to the payload without changing http content-length
+
+        # this blecherous hack is because we need the length of the (possibly compressed) raw stream
+        # without reading any of it (so that it can be read elsewhere to check the payload digest)
+
+        # XXX fix me before shipping :-D
+
+        if hasattr(record, 'raw_stream'):
+            if hasattr(record.raw_stream, 'stream'):
+                if hasattr(record.raw_stream.stream, 'limit'):
+                    if int(http_content_length) != record.raw_stream.stream.limit:
+                        commentary.comment('Actual http payload length is different from http header Content-Length:',
+                                           str(record.raw_stream.stream.limit), http_content_length)
 
 
 def validate_resource(record, commentary, pending):
@@ -171,6 +203,7 @@ def validate_resource(record, commentary, pending):
             pass
 
     # should never have http headers
+    #   heuristic of looking for an http status line? and then a blank line?!
 
 
 def validate_request(record, commentary, pending):
@@ -193,6 +226,8 @@ def validate_request(record, commentary, pending):
 def validate_metadata(record, commentary, pending):
     content_type = record.rec_headers.get_header('Content-Type', 'none')
     if content_type.lower() == 'application/warc-fields':
+        # https://github.com/iipc/warc-specifications/issues/33 SHALL be or not?
+        #
         # dublin core plus via, hopsFromSeed, fetchTimeMs -- w1.1 section 6
         # via: uri -- example in Warc 1.1 section 10.5 does not have <> around it
         # hopsFromSeed: string
@@ -206,8 +241,11 @@ def validate_revisit(record, commentary, pending):
     if warc_profile.endswith('/revisit/identical-payload-digest') or warc_profile.endswith('/revisit/uri-agnostic-identical-payload-digest'):
         config = {
             'required': ['WARC-Payload-Digest'],
-            'recommended': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date'],
+            'recommended': ['WARC-Refers-To'],
         }
+        if '/1.1/' in warc_profile:
+            config['recommended'].extend(('WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date'))
+
         validate_fields_against_rec_type('revisit', config, record.rec_headers, commentary, allow_all=True)
         # may have record block;
         #  if not, shall have Content-Length: 0,
@@ -282,7 +320,6 @@ def validate_bracketed_uri(field, value, record, version, commentary, pending):
 
 def validate_record_id(field, value, record, version, commentary, pending):
     validate_bracketed_uri(field, value, record, version, commentary, pending)
-    # TODO: should be "globally unique for its period of intended use"
 
 
 def validate_timestamp(field, value, record, version, commentary, pending):
@@ -328,8 +365,6 @@ def validate_content_type(field, value, record, version, commentary, pending):
     # at this point there can be multiple parameters,
     # some of which could have quoted string values with ; in them
 
-    # TODO: more checking
-
 
 def validate_digest(field, value, record, version, commentary, pending):
     if ':' not in value:
@@ -370,37 +405,45 @@ def validate_ip(field, value, record, version, commentary, pending):
 
 def validate_truncated(field, value, record, version, commentary, pending):
     if value.lower() not in {'length', 'time', 'disconnect', 'unspecified'}:
-        commentary.comment('extension seen:', field, value)
+        commentary.comment('unknown value, perhaps an extension:', field, value)
 
 
 def validate_warcinfo_id(field, value, record, version, commentary, pending):
     validate_bracketed_uri(field, value, record, version, commentary, pending)
-    # TODO: should point at a warcinfo record
 
 
 def validate_filename(field, value, record, version, commentary, pending):
-    # TODO: text or quoted-string
+    # text or quoted-string
+    # comment for dangerous utf-8 in filename?
     pass
 
 
 profiles = {
-    # XXX WARC/0.17 and WARC/0.18
+    '0.17': ['http://netpreserve.org/warc/0.17/revisit/identical-payload-digest',
+             'http://netpreserve.org/warc/0.17/revisit/server-not-modified'],
+    '0.18': ['http://netpreserve.org/warc/0.18/revisit/identical-payload-digest',
+             'http://netpreserve.org/warc/0.18/revisit/server-not-modified'],
     '1.0': ['http://netpreserve.org/warc/1.0/revisit/identical-payload-digest',
             'http://netpreserve.org/warc/1.0/revisit/server-not-modified',
-            # the following removed from iipc/webarchive-commons in may 2017; common in the wild TODO comment or not?
-            # https://github.com/iipc/webarchive-commons/commits/988bec707c27a01333becfc3bd502af4441ea1e1/src/main/java/org/archive/format/warc/WARCConstants.java
             'http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest'],
     '1.1': ['http://netpreserve.org/warc/1.1/revisit/identical-payload-digest',
             'http://netpreserve.org/warc/1.1/revisit/server-not-modified'],
 }
+profiles_rev = dict([(filename, version) for version, filenames in profiles.items() for filename in filenames])
 
 
 def validate_profile(field, value, record, version, commentary, pending):
     if version not in profiles:
-        commentary.comment('no profile check because unknown warc version:', field, value)
         return
-    if value not in profiles[version]:
-        commentary.comment('extension seen:', field, value)
+
+    if value in profiles_rev:
+        if profiles_rev[value] != version:
+            commentary.comment('WARC-Profile value is for a different version:', version, value)
+    else:
+        commentary.comment('unknown value, perhaps an extension:', field, value)
+
+    if '/revisit/uri-agnostic-identical-payload-digest' in value:
+        commentary.comment('This Heretrix extension never made it into the standard:', field, value)
 
 
 def validate_segment_number(field, value, record, version, commentary, pending):
@@ -427,6 +470,14 @@ def validate_segment_total_length(field, value, record, version, commentary, pen
         commentary.error('must be an integer:', field, value)
 
 
+def validate_refers_to_filename(field, value, record, version, commentary, pending):
+    commentary.comment('This Heretrix extension never made it into the standard:', field, value)
+
+
+def validate_refers_to_file_offset(field, value, record, version, commentary, pending):
+    commentary.comment('This Heretrix extension never made it into the standard:', field, value)
+
+
 warc_fields = {
     'WARC-Type': {
         'validate': validate_warc_type,
@@ -493,6 +544,12 @@ def validate_segment_total_length(field, value, record, version, commentary, pen
         'validate': validate_timestamp,
         'minver': '1.1',
     },
+    'WARC-Refers-To-Filename': {
+        'validate': validate_refers_to_filename,
+    },
+    'WARC-Refers-To-File-Offset': {
+        'validate': validate_refers_to_file_offset,
+    },
 }
 warc_fields = dict([(k.lower(), v) for k, v in warc_fields.items()])
 
@@ -579,13 +636,13 @@ def validate_fields_against_rec_type(rec_type, config, rec_headers, commentary,
     allowed = make_header_set(config, ('required', 'optional', 'recommended', 'ignored'))
     prohibited = make_header_set(config, ('prohibited',))
 
-    for field, value in rec_headers.headers:
+    for field, value in rec_headers.headers:  # XXX not exported
         fl = field.lower()
         if fl in prohibited:
             commentary.error('field not allowed in record type:', rec_type, field)
         elif allow_all or fl in allowed:
             pass
-        elif fl in warc_fields:  # pragma: no cover (this is a configuration error, if it happens)
+        elif fl in warc_fields:  # pragma: no cover (this is a tester.py configuration omission)
             commentary.comment('Known field, but not expected for this record type:', rec_type, field)
         else:
             # an 'unknown field' comment has already been issued in validate_record
@@ -598,15 +655,15 @@ def validate_record_against_rec_type(config, record, commentary, pending):
 
 
 def validate_record(record):
-    version = record.rec_headers.protocol.split('/', 1)[1]  # XXX not exported?
+    version = record.rec_headers.protocol.split('/', 1)[1]  # XXX not exported
 
     record_id = record.rec_headers.get_header('WARC-Record-ID')
     rec_type = record.rec_headers.get_header('WARC-Type')
-    commentary = Commentary(record_id, rec_type)
+    commentary = Commentary(record_id=record_id, rec_type=rec_type)
     pending = None
 
     seen_fields = set()
-    for field, value in record.rec_headers.headers:
+    for field, value in record.rec_headers.headers:  # XXX not exported
         field_l = field.lower()
         if field_l != 'warc-concurrent-to' and field_l in seen_fields:
             commentary.error('duplicate field seen:', field, value)
@@ -617,13 +674,13 @@ def validate_record(record):
         config = warc_fields[field_l]
         if 'minver' in config:
             if version < config['minver']:
-                # unknown fields are extensions, so this is a comment and not an error
                 commentary.comment('field was introduced after this warc version:', version, field, value)
         if 'validate' in config:
             config['validate'](field, value, record, version, commentary, pending)
 
     if rec_type not in record_types:
-        pass  # we print a comment for this elsewhere
+        # we print a comment for this elsewhere
+        pass
     else:
         validate_fields_against_rec_type(rec_type, record_types[rec_type], record.rec_headers, commentary)
         validate_record_against_rec_type(record_types[rec_type], record, commentary, pending)
@@ -631,10 +688,149 @@ def validate_record(record):
     return commentary
 
 
-def _process_one(warc):
-    if warc.endswith('.arc') or warc.endswith('.arc.gz'):
+def save_global_info(record, warcfile, commentary, all_records, concurrent_to):
+    record_id = record.rec_headers.get_header('WARC-Record-ID')
+    if record_id is None:
         return
-    with open(warc, 'rb') as stream:
+
+    for field, value in record.rec_headers.headers:  # XXX not exported
+        if field.lower() == 'warc-concurrent-to':
+            if record_id is not None and value is not None:
+                concurrent_to[record_id].append(value)
+                concurrent_to[value].append(record_id)
+
+    save = {'warcfile': warcfile}
+
+    saved_fields = (
+        'WARC-Type', 'WARC-Warcinfo-ID', 'WARC-Date'
+        'WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Payload-Digest', 'WARC-Target-URI',
+        'WARC-Segment-Number', 'WARC-Segment-Origin-ID', 'WARC-Segment-Total-Length', 'WARC-Truncated'
+    )
+    saved_fields = set([x.lower() for x in saved_fields])
+
+    for field, value in record.rec_headers.headers:  # XXX not exported
+        field_l = field.lower()
+        if field_l in saved_fields and value is not None:
+            save[field_l] = value
+        if field_l == 'warc-concurrent-to':
+            if 'warc-concurrent-to' not in save:
+                save['warc-concurrent-to'] = []
+            save['warc-concurrent-to'].append(value)
+
+    if record_id in all_records:
+        commentary.error('Duplicate WARC-Record-ID:', record_id, 'found in files', warcfile, all_records[record_id]['warcfile'])
+    else:
+        all_records[record_id] = save
+
+
+def check_global(all_records, concurrent_to):
+    check_global_warcinfo(all_records)
+    check_global_concurrent_to(all_records, concurrent_to)
+    check_global_refers_to(all_records)
+    check_global_segment(all_records)
+
+
+def _print_global(header, commentary):
+    if commentary.has_comments():
+        print(header)
+        for c in commentary.comments():
+            print(' ', c)
+
+
+def check_global_warcinfo(all_records):
+    commentary = Commentary()
+    for record_id, fields in all_records.items():
+        if 'warc-warcinfo-id' in fields:
+            wanted_id = fields['warc-warcinfo-id']
+            if wanted_id not in all_records or all_records[wanted_id]['warc-type'] != 'warcinfo':
+                commentary.comment('WARC-Warcinfo-ID not found:', record_id, 'WARC-Warcinfo-ID', wanted_id)
+
+    _print_global('global warcinfo checks', commentary)
+
+
+def check_global_concurrent_to(all_records, concurrent_to):
+    commentary = Commentary()
+    for record_id, fields in all_records.items():
+        if 'warc-concurrent-to' in fields:
+            whole_set = set(fields['warc-concurrent-to'])
+            del fields['warc-concurrent-to']
+            while True:
+                current_set = list(whole_set)
+                for c in current_set:
+                    if c in all_records and 'warc-concurrent-to' in all_records[c]:
+                        whole_set.update(set(all_records[c]['warc-concurrent-to']))
+                        del all_records[c]['warc-concurrent-to']
+                if len(whole_set) == len(current_set):
+                    break
+            warc_date = fields.get('warc-date')
+            for wanted_id in sorted(whole_set):
+                if wanted_id not in all_records:
+                    commentary.comment('WARC-Concurrent-To not found:', record_id, 'WARC-Concurrent-To', wanted_id)
+                else:
+                    new_date = all_records[wanted_id].get('warc-date')
+                    if warc_date != new_date:
+                        commentary.comment('WARC-Concurrent-To set has conflicting dates:',
+                                           record_id, warc_date, wanted_id, new_date)
+
+    _print_global('global Concurrent-To checks', commentary)
+
+
+def _revisit_compare(record_id, fields, source_field, wanted_id, all_records, target_field, commentary):
+    if source_field.lower() not in fields:
+        return
+
+    if target_field.lower() not in all_records[wanted_id]:
+        commentary.comment('revisit target lacks field:', wanted_id, target_field)
+        return
+
+    source_value = fields[source_field.lower()]
+    target_value = all_records[wanted_id][target_field.lower()]
+    if source_value != target_value:
+        commentary.comment('revisit and revisit target disagree:',
+                           record_id, source_field, source_value,
+                           wanted_id, target_field, target_value)
+
+
+def check_global_refers_to(all_records):
+    commentary = Commentary()
+    for record_id, fields in all_records.items():
+        if 'warc-refers-to' not in fields:
+            continue
+
+        wanted_id = fields['warc-refers-to']
+        if wanted_id not in all_records:
+            commentary.comment('WARC-Refers-To target not found:', record_id, 'Warc-Refers-To', wanted_id)
+            continue
+
+        rec_type = fields.get('warc-type')
+        if rec_type != 'revisit':
+            continue
+
+        _revisit_compare(record_id, fields, 'WARC-Refers-To-Target-URI',
+                         wanted_id, all_records, 'WARC-Target-URI', commentary)
+        _revisit_compare(record_id, fields, 'WARC-Refers-To-Date',
+                         wanted_id, all_records, 'WARC-Date', commentary)
+        _revisit_compare(record_id, fields, 'WARC-Payload-Digest',
+                         wanted_id, all_records, 'WARC-Payload-Digest', commentary)
+
+    _print_global('global Refers-To checks', commentary)
+
+
+def check_global_segment(all_records):
+    # warc-segment-origin-id :: exists, is warc-segment-number 1
+    #   all segments exist, and the last one has WARC-Segment-Total-Length
+    #   and only the last one has WARC-Truncated, if any
+
+    # Segmentation shall not be used if a record can be stored in an existing warc file
+    # The origin segment shall be placed in a new warc file preceded only by a warcinfo record (if any)
+
+    pass
+
+
+def _process_one(warcfile, all_records, concurrent_to):
+    if warcfile.endswith('.arc') or warcfile.endswith('.arc.gz'):
+        return
+    with open(warcfile, 'rb') as stream:
         for record in WARCIterator(stream, check_digests=True, fixup_bugs=False):
 
             record = WrapRecord(record)
@@ -642,10 +838,9 @@ def _process_one(warc):
                               record.rec_headers.get_header('WARC-Block-Digest'))
 
             commentary = validate_record(record)
+            save_global_info(record, warcfile, commentary, all_records, concurrent_to)
 
-            record.content  # make sure digests are checked
-            # XXX might need to read and digest the raw stream to check digests for chunked encoding?
-            # XXX chunked lacks Content-Length and presumably the digest needs to be computed on the non-chunked bytes
+            record.stream_for_digest_check()
 
             if commentary.has_comments() or record.digest_checker.passed is False:
                 print(' ', 'WARC-Record-ID', commentary.record_id())
@@ -671,16 +866,21 @@ class Tester(object):
     def __init__(self, cmd):
         self.inputs = cmd.inputs
         self.exit_value = 0
+        self.all_records = defaultdict(dict)
+        self.concurrent_to = defaultdict(list)
 
     def process_all(self):
-        for warc in self.inputs:
-            print(warc)
+        for warcfile in self.inputs:
+            print(warcfile)
             try:
-                self.process_one(warc)
+                self.process_one(warcfile)
             except ArchiveLoadFailed as e:
                 print('  saw exception ArchiveLoadFailed: '+str(e).rstrip(), file=sys.stderr)
                 print('  skipping rest of file', file=sys.stderr)
+
+        check_global(self.all_records, self.concurrent_to)
+
         return self.exit_value
 
-    def process_one(self, filename):
-        _process_one(filename)
+    def process_one(self, warcfile):
+        _process_one(warcfile, self.all_records, self.concurrent_to)

From f7cd1dbb28cb4033bd70878729f86479a0643971 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Thu, 31 Jan 2019 12:03:44 -0800
Subject: [PATCH 58/68] check -v; capitalize most commentary

---
 warcio/cli.py    |  1 +
 warcio/tester.py | 89 +++++++++++++++++++++++++-----------------------
 2 files changed, 48 insertions(+), 42 deletions(-)

diff --git a/warcio/cli.py b/warcio/cli.py
index ada44f12..bbe51a93 100644
--- a/warcio/cli.py
+++ b/warcio/cli.py
@@ -55,6 +55,7 @@ def main(args=None):
 
     test = subparsers.add_parser('test', help='WARC standards tester')
     test.add_argument('inputs', nargs='+')
+    test.add_argument('-v', '--verbose', action='store_true')
     test.set_defaults(func=tester)
 
     cmd = parser.parse_args(args=args)
diff --git a/warcio/tester.py b/warcio/tester.py
index 870c7d6e..9605ea7b 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -157,7 +157,7 @@ def validate_response(record, commentary, pending):
     if target_uri.startswith('http:') or target_uri.startswith('https:'):
         content_type = record.rec_headers.get_header('Content-Type', 'none')
         if canon_content_type(content_type) not in {'application/http; msgtype=response', 'application/http'}:
-            commentary.error('responses for http/https should have Content-Type of application/http; msgtype=response or application/http:', content_type)
+            commentary.error('Responses for http/https should have Content-Type of application/http; msgtype=response or application/http:', content_type)
 
         if record.rec_headers.get_header('WARC-IP-Address') is None:
             commentary.error('WARC-IP-Address should be used for http and https responses')
@@ -264,7 +264,7 @@ def validate_revisit(record, commentary, pending):
         #     if yes, should be like a response record, truncated if desired
         #   WARC-Refers-To-Date should be the same as WARC-Date in the original record if present
     else:
-        commentary.comment('no revisit details validation done due to unknown profile:', warc_profile)
+        commentary.comment('No revisit details validation done due to unknown profile:', warc_profile)
 
 
 def validate_conversion(record, commentary, pending):
@@ -291,14 +291,17 @@ def validate_unbracketed_uri(field, value, record, version, commentary, pending)
     if value.startswith('<') or value.endswith('>'):
         # wget 1.19 bug caused by WARC 1.0 spec error
         commentary.error('uri must not be within <>:', field, value)
+        value = value[1:-1]
+
+    scheme = value.split(':', 1)[0]
     if ':' not in value:
-        commentary.error('invalid uri, no scheme:', field, value)
+        commentary.error('Invalid uri, no scheme:', field, value)
+    elif not re.search(r'\A[A-Za-z][A-Za-z0-9+\-\.]*\Z', scheme):
+        commentary.error('Invalid uri scheme, bad character:', field, value)
+        # use https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml ??
+
     if re.search(r'\s', value):
-        commentary.error('invalid uri, contains whitespace:', field, value)
-    scheme = value.split(':', 1)[0]
-    if not re.search(r'\A[A-Za-z][A-Za-z0-9+\-\.]*\Z', scheme):
-        commentary.error('invalid uri scheme, bad character:', field, value)
-    # https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
+        commentary.error('Invalid uri, contains whitespace:', field, value)
 
 
 def validate_warc_type(field, value, record, version, commentary, pending):
@@ -307,7 +310,7 @@ def validate_warc_type(field, value, record, version, commentary, pending):
         commentary.comment('WARC-Type is not lower-case:', field, value)
     if value.lower() not in record_types:
         # standard says readers should ignore unknown warc-types
-        commentary.comment('unknown WARC-Type:', field, value)
+        commentary.comment('Unknown WARC-Type:', field, value)
 
 
 def validate_bracketed_uri(field, value, record, version, commentary, pending):
@@ -337,7 +340,7 @@ def validate_timestamp(field, value, record, version, commentary, pending):
 
 def validate_content_length(field, value, record, version, commentary, pending):
     if not value.isdigit():
-        commentary.error('must be an integer:', field, value)
+        commentary.error('Must be an integer:', field, value)
 
 
 token_re = r'\A[!"#$%&\'()*+\-\.0-9A-Z\^_`a-z|~]+\Z'
@@ -346,7 +349,7 @@ def validate_content_length(field, value, record, version, commentary, pending):
 
 def validate_content_type(field, value, record, version, commentary, pending):
     if '/' not in value:
-        commentary.error('must contain a /:', field, value)
+        commentary.error('Must contain a /:', field, value)
     splits = value.split('/', 1)
     ctype = splits[0]
     if len(splits) > 1:
@@ -354,13 +357,13 @@ def validate_content_type(field, value, record, version, commentary, pending):
     else:
         rest = ''
     if not re.search(token_re, ctype):
-        commentary.error('invalid type:', field, value)
+        commentary.error('Invalid type:', field, value)
     if ';' in rest:
         subtype, rest = rest.split(';', 1)
     else:
         subtype = rest
     if not re.search(token_re, subtype):
-        commentary.error('invalid subtype:', field, value)
+        commentary.error('Invalid subtype:', field, value)
 
     # at this point there can be multiple parameters,
     # some of which could have quoted string values with ; in them
@@ -368,7 +371,7 @@ def validate_content_type(field, value, record, version, commentary, pending):
 
 def validate_digest(field, value, record, version, commentary, pending):
     if ':' not in value:
-        commentary.error('missing algorithm:', field, value)
+        commentary.error('Missing algorithm:', field, value)
     splits = value.split(':', 1)
     algorithm = splits[0]
     if len(splits) > 1:
@@ -376,12 +379,12 @@ def validate_digest(field, value, record, version, commentary, pending):
     else:
         digest = 'none'
     if not re.search(token_re, algorithm):
-        commentary.error('invalid algorithm:', field, value)
+        commentary.error('Invalid algorithm:', field, value)
     else:
         try:
             Digester(algorithm)
         except ValueError:
-            commentary.comment('unknown digest algorithm:', field, value)
+            commentary.comment('Unknown digest algorithm:', field, value)
     if not re.search(token_re, digest):
         # https://github.com/iipc/warc-specifications/issues/48
         # commentary.comment('spec incorrectly says this is an invalid digest', field, value)
@@ -398,14 +401,14 @@ def validate_ip(field, value, record, version, commentary, pending):
             value = unicode(value)
         ipaddress.ip_address(value)
     except ValueError:
-        commentary.error('invalid ip:', field, value)
+        commentary.error('Invalid ip:', field, value)
     except (ImportError, NameError):  # pragma: no cover
-        commentary.comment('did not check ip address format, install ipaddress module from pypi if you care')
+        commentary.comment('Did not check ip address format, install ipaddress module from pypi if you care')
 
 
 def validate_truncated(field, value, record, version, commentary, pending):
     if value.lower() not in {'length', 'time', 'disconnect', 'unspecified'}:
-        commentary.comment('unknown value, perhaps an extension:', field, value)
+        commentary.comment('Unknown value, perhaps an extension:', field, value)
 
 
 def validate_warcinfo_id(field, value, record, version, commentary, pending):
@@ -440,7 +443,7 @@ def validate_profile(field, value, record, version, commentary, pending):
         if profiles_rev[value] != version:
             commentary.comment('WARC-Profile value is for a different version:', version, value)
     else:
-        commentary.comment('unknown value, perhaps an extension:', field, value)
+        commentary.comment('Unknown value, perhaps an extension:', field, value)
 
     if '/revisit/uri-agnostic-identical-payload-digest' in value:
         commentary.comment('This Heretrix extension never made it into the standard:', field, value)
@@ -448,26 +451,26 @@ def validate_profile(field, value, record, version, commentary, pending):
 
 def validate_segment_number(field, value, record, version, commentary, pending):
     if not value.isdigit():
-        commentary.error('must be an integer:', field, value)
+        commentary.error('Must be an integer:', field, value)
         return
     iv = int(value)
     if iv == 0:
-        commentary.error('must be 1 or greater:', field, value)
+        commentary.error('Must be 1 or greater:', field, value)
 
     rec_type = record.rec_headers.get_header('WARC-Type', 'none')
     if rec_type != 'continuation':
         if iv != 1:
-            commentary.error('non-continuation records must always have WARC-Segment-Number: 1:', field, value)
+            commentary.error('Non-continuation records must always have WARC-Segment-Number: 1:', field, value)
         origin_id = record.rec_headers.get_header('WARC-Segment-Origin-ID')
         if origin_id is None:
-            commentary.error('segmented records must have both WARC-Segment-Number and WARC-Segment-Origin-ID')
+            commentary.error('Segmented records must have both WARC-Segment-Number and WARC-Segment-Origin-ID')
     if rec_type in {'warcinfo', 'request', 'metadata', 'revisit'}:
-        commentary.recommendation('do not segment WARC-Type', rec_type)
+        commentary.recommendation('Do not segment WARC-Type', rec_type)
 
 
 def validate_segment_total_length(field, value, record, version, commentary, pending):
     if not value.isdigit():
-        commentary.error('must be an integer:', field, value)
+        commentary.error('Must be an integer:', field, value)
 
 
 def validate_refers_to_filename(field, value, record, version, commentary, pending):
@@ -525,6 +528,7 @@ def validate_refers_to_file_offset(field, value, record, version, commentary, pe
         'validate': validate_profile,
     },
     'WARC-Identified-Payload-Type': {
+        # see also https://github.com/iipc/warc-specifications/issues/49 -- odd that it's allowed for request, revisit, continuation
         'validate': validate_content_type,
     },
     'WARC-Segment-Origin-ID': {
@@ -565,7 +569,7 @@ def validate_refers_to_file_offset(field, value, record, version, commentary, pe
         'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type',
                      'Content-Type', 'WARC-Target-URI'],
         'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-IP-Address', 'WARC-Truncated',
-                     'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Segment-Number', 'WARC-Segment-Origin-ID'],
+                     'WARC-Concurrent-To', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Identified-Payload-Type', 'WARC-Segment-Number', 'WARC-Segment-Origin-ID'],
         'prohibited': ['WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
         'validate': validate_response,
     },
@@ -605,7 +609,7 @@ def validate_refers_to_file_offset(field, value, record, version, commentary, pe
     },
     'conversion': {
         'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type', 'WARC-Target-URI'],
-        'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Truncated', 'WARC-Refers-To', 'WARC-Warcinfo-ID', 'WARC-Segment-Number', 'WARC-Segment-Origin-ID'],
+        'optional': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Truncated', 'WARC-Refers-To', 'WARC-Warcinfo-ID', 'WARC-Identified-Payload-Type', 'WARC-Segment-Number', 'WARC-Segment-Origin-ID'],
         'prohibited': ['WARC-Concurrent-To', 'WARC-IP-Address', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
         'validate': validate_conversion,
     },
@@ -613,7 +617,7 @@ def validate_refers_to_file_offset(field, value, record, version, commentary, pe
         'required': ['WARC-Record-ID', 'Content-Length', 'WARC-Date', 'WARC-Type',
                      'WARC-Segment-Number', 'WARC-Segment-Origin-ID', 'WARC-Target-URI'],
         'optional': ['WARC-Segment-Total-Length', 'WARC-Truncated'],
-        'prohibited': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile'],
+        'prohibited': ['WARC-Block-Digest', 'WARC-Payload-Digest', 'WARC-Warcinfo-ID', 'WARC-IP-Address', 'WARC-Refers-To', 'WARC-Refers-To-Target-URI', 'WARC-Refers-To-Date', 'WARC-Filename', 'WARC-Profile', 'WARC-Identified-Payload-Type'],
         'validate': validate_continuation,
     },
 }
@@ -629,17 +633,17 @@ def make_header_set(config, kinds):
 def validate_fields_against_rec_type(rec_type, config, rec_headers, commentary, allow_all=False):
     for req in sorted(config.get('required', [])):
         if not rec_headers.get_header(req):
-            commentary.error('missing required header:', req)
+            commentary.error('Missing required header:', req)
     for rec in sorted(config.get('recommended', [])):
         if not rec_headers.get_header(rec):
-            commentary.recommendation('missing recommended header:', rec)
+            commentary.recommendation('Missing recommended header:', rec)
     allowed = make_header_set(config, ('required', 'optional', 'recommended', 'ignored'))
     prohibited = make_header_set(config, ('prohibited',))
 
     for field, value in rec_headers.headers:  # XXX not exported
         fl = field.lower()
         if fl in prohibited:
-            commentary.error('field not allowed in record type:', rec_type, field)
+            commentary.error('Field not allowed in record type:', rec_type, field)
         elif allow_all or fl in allowed:
             pass
         elif fl in warc_fields:  # pragma: no cover (this is a tester.py configuration omission)
@@ -666,15 +670,15 @@ def validate_record(record):
     for field, value in record.rec_headers.headers:  # XXX not exported
         field_l = field.lower()
         if field_l != 'warc-concurrent-to' and field_l in seen_fields:
-            commentary.error('duplicate field seen:', field, value)
+            commentary.error('Duplicate field seen:', field, value)
         seen_fields.add(field_l)
         if field_l not in warc_fields:
-            commentary.comment('unknown field, no validation performed:', field, value)
+            commentary.comment('Unknown field, no validation performed:', field, value)
             continue
         config = warc_fields[field_l]
         if 'minver' in config:
             if version < config['minver']:
-                commentary.comment('field was introduced after this warc version:', version, field, value)
+                commentary.comment('Field was introduced after this warc version:', version, field, value)
         if 'validate' in config:
             config['validate'](field, value, record, version, commentary, pending)
 
@@ -780,13 +784,13 @@ def _revisit_compare(record_id, fields, source_field, wanted_id, all_records, ta
         return
 
     if target_field.lower() not in all_records[wanted_id]:
-        commentary.comment('revisit target lacks field:', wanted_id, target_field)
+        commentary.comment('Revisit target lacks field:', wanted_id, target_field)
         return
 
     source_value = fields[source_field.lower()]
     target_value = all_records[wanted_id][target_field.lower()]
     if source_value != target_value:
-        commentary.comment('revisit and revisit target disagree:',
+        commentary.comment('Revisit and revisit target disagree:',
                            record_id, source_field, source_value,
                            wanted_id, target_field, target_value)
 
@@ -827,7 +831,7 @@ def check_global_segment(all_records):
     pass
 
 
-def _process_one(warcfile, all_records, concurrent_to):
+def _process_one(warcfile, all_records, concurrent_to, verbose):
     if warcfile.endswith('.arc') or warcfile.endswith('.arc.gz'):
         return
     with open(warcfile, 'rb') as stream:
@@ -842,7 +846,7 @@ def _process_one(warcfile, all_records, concurrent_to):
 
             record.stream_for_digest_check()
 
-            if commentary.has_comments() or record.digest_checker.passed is False:
+            if verbose or commentary.has_comments() or record.digest_checker.passed is False:
                 print(' ', 'WARC-Record-ID', commentary.record_id())
                 print('   ', 'WARC-Type', commentary.rec_type())
 
@@ -865,6 +869,7 @@ def _process_one(warcfile, all_records, concurrent_to):
 class Tester(object):
     def __init__(self, cmd):
         self.inputs = cmd.inputs
+        self.verbose = cmd.verbose
         self.exit_value = 0
         self.all_records = defaultdict(dict)
         self.concurrent_to = defaultdict(list)
@@ -875,12 +880,12 @@ def process_all(self):
             try:
                 self.process_one(warcfile)
             except ArchiveLoadFailed as e:
-                print('  saw exception ArchiveLoadFailed: '+str(e).rstrip(), file=sys.stderr)
-                print('  skipping rest of file', file=sys.stderr)
+                print('  saw exception ArchiveLoadFailed: '+str(e).rstrip())
+                print('  skipping rest of file')
 
         check_global(self.all_records, self.concurrent_to)
 
         return self.exit_value
 
     def process_one(self, warcfile):
-        _process_one(warcfile, self.all_records, self.concurrent_to)
+        _process_one(warcfile, self.all_records, self.concurrent_to, self.verbose)

From b570b6c09cd9e32110b5127721e4676ac7405d02 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Thu, 31 Jan 2019 21:49:41 -0800
Subject: [PATCH 59/68] update test_leftovers expected output for capitalized commentary

---
 test/test_tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_tests.py b/test/test_tests.py
index ebbdb509..9c3c9fec 100644
--- a/test/test_tests.py
+++ b/test/test_tests.py
@@ -342,7 +342,7 @@ def test_leftovers():
     warcio.tester.validate_profile('blah', 'blah', None, '999', commentary, None)
 
     expected = '''\
-error: must be an integer: Content-Length not-an-integer
+error: Must be an integer: Content-Length not-an-integer
 '''
 
     assert '\n'.join(commentary.comments())+'\n' == expected

From 921e7486d2ef84da7684833acce41ae3957710dc Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Fri, 1 Feb 2019 10:25:40 -0800
Subject: [PATCH 60/68] revisits and global detection with just one file

---
 warcio/tester.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/warcio/tester.py b/warcio/tester.py
index 9605ea7b..68f108b2 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -722,7 +722,10 @@ def save_global_info(record, warcfile, commentary, all_records, concurrent_to):
             save['warc-concurrent-to'].append(value)
 
     if record_id in all_records:
-        commentary.error('Duplicate WARC-Record-ID:', record_id, 'found in files', warcfile, all_records[record_id]['warcfile'])
+        if warcfile != all_records[record_id]['warcfile']:
+            commentary.error('Duplicate WARC-Record-ID:', record_id, 'found in files', warcfile, all_records[record_id]['warcfile'])
+        else:
+            commentary.error('Duplicate WARC-Record-ID:', record_id)
     else:
         all_records[record_id] = save
 
@@ -853,9 +856,12 @@ def _process_one(warcfile, all_records, concurrent_to, verbose):
                 if record.digest_checker.passed is True:
                     print('    digest pass')
                 elif record.digest_checker.passed is None:
-                    if digest_present:  # pragma: no cover
-                        # WARC record missing Content-Length: header, which is verboten
-                        print('    digest present but not checked')
+                    if digest_present:
+                        if commentary.rec_type() == 'revisit':
+                            print('    digest present but not checked (revisit)')
+                        else:  # pragma: no cover
+                            # WARC record missing Content-Length: header, which is verboten
+                            print('    digest present but not checked')
                     else:
                         print('    digest not present')
                 for p in record.digest_checker.problems:

From 4265b62e5450cdb0b9d694a0d9e76124b2c4ec8c Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Fri, 1 Feb 2019 15:47:01 -0800
Subject: [PATCH 61/68] show errors for decompression and unchunking failures

---
 test/test_tests.py        | 14 +++++++-------
 warcio/archiveiterator.py |  5 +++--
 warcio/bufferedreaders.py | 17 ++++++++++++++---
 warcio/recordloader.py    | 10 ++++++----
 warcio/tester.py          | 12 ++++++++++--
 5 files changed, 40 insertions(+), 18 deletions(-)

diff --git a/test/test_tests.py b/test/test_tests.py
index 9c3c9fec..200df8ae 100644
--- a/test/test_tests.py
+++ b/test/test_tests.py
@@ -297,17 +297,17 @@ def test_digests():
     WARC-Type request
     digest pass
     error: WARC-IP-Address should be used for http and https requests
-    error: Duplicate WARC-Record-ID: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007> found in files test/data/example-digest-bad.warc test/data/example-digest-bad.warc
+    error: Duplicate WARC-Record-ID: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
   WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
     WARC-Type request
     digest pass
     error: WARC-IP-Address should be used for http and https requests
-    error: Duplicate WARC-Record-ID: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007> found in files test/data/example-digest-bad.warc test/data/example-digest-bad.warc
+    error: Duplicate WARC-Record-ID: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
   WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
     WARC-Type request
     digest pass
     error: WARC-IP-Address should be used for http and https requests
-    error: Duplicate WARC-Record-ID: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007> found in files test/data/example-digest-bad.warc test/data/example-digest-bad.warc
+    error: Duplicate WARC-Record-ID: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
 test/data/example.warc
   WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
     WARC-Type request
@@ -316,11 +316,11 @@ def test_digests():
     error: Duplicate WARC-Record-ID: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007> found in files test/data/example.warc test/data/example-digest-bad.warc
   WARC-Record-ID <urn:uuid:e6e395ca-0221-11e7-a18d-0242ac120005>
     WARC-Type revisit
-    digest present but not checked
-    recommendation: missing recommended header: WARC-Refers-To
+    digest present but not checked (revisit)
+    recommendation: Missing recommended header: WARC-Refers-To
     comment: This Heretrix extension never made it into the standard: WARC-Profile http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest
-    comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com/
-    comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Date 2017-03-06T04:02:06Z
+    comment: Field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com/
+    comment: Field was introduced after this warc version: 1.0 WARC-Refers-To-Date 2017-03-06T04:02:06Z
   WARC-Record-ID <urn:uuid:e6e41fea-0221-11e7-8fe3-0242ac120007>
     WARC-Type request
     digest not present
diff --git a/warcio/archiveiterator.py b/warcio/archiveiterator.py
index 24094936..5e9c02ca 100644
--- a/warcio/archiveiterator.py
+++ b/warcio/archiveiterator.py
@@ -56,13 +56,14 @@ class ArchiveIterator(six.Iterator):
     def __init__(self, fileobj, no_record_parse=False,
                  verify_http=False, arc2warc=False,
                  ensure_http_headers=False, block_size=BUFF_SIZE,
-                 check_digests=False, fixup_bugs=True):
+                 check_digests=False, fixup_bugs=True, raise_exceptions=False):
 
         self.fh = fileobj
 
         self.loader = ArcWarcRecordLoader(verify_http=verify_http,
                                           arc2warc=arc2warc,
-                                          fixup_bugs=fixup_bugs)
+                                          fixup_bugs=fixup_bugs,
+                                          raise_exceptions=raise_exceptions)
         self.known_format = None
 
         self.mixed_arc_warc = arc2warc
diff --git a/warcio/bufferedreaders.py b/warcio/bufferedreaders.py
index 0b7f72f7..734ce23a 100644
--- a/warcio/bufferedreaders.py
+++ b/warcio/bufferedreaders.py
@@ -36,6 +36,13 @@ def brotli_decompressor():
         pass
 
 
+#=================================================================
+class DecompressionException(Exception):
+    def __init__(self, msg, data=b''):
+        Exception.__init__(self, msg)
+        self.data = data
+
+
 #=================================================================
 class BufferedReader(object):
     """
@@ -64,7 +71,8 @@ class BufferedReader(object):
     def __init__(self, stream, block_size=BUFF_SIZE,
                  decomp_type=None,
                  starting_data=None,
-                 read_all_members=False):
+                 read_all_members=False,
+                 raise_exceptions=False):
 
         self.stream = stream
         self.block_size = block_size
@@ -77,6 +85,7 @@ def __init__(self, stream, block_size=BUFF_SIZE,
         self.buff_size = 0
 
         self.read_all_members = read_all_members
+        self.raise_exceptions = raise_exceptions
 
     def set_decomp(self, decomp_type):
         self._init_decomp(decomp_type)
@@ -142,6 +151,8 @@ def _decompress(self, data):
                         self._init_decomp('deflate_alt')
                         data = self._decompress(data)
                     else:
+                        if self.raise_exceptions:
+                            raise DecompressionException(str(e))
                         self.decompressor = None
                 # otherwise (partly decompressed), something is wrong
                 else:
@@ -280,13 +291,13 @@ class ChunkedDataReader(BufferedReader):
     If at any point the chunked header is not available, the stream is
     assumed to not be chunked and no more dechunking occurs.
     """
-    def __init__(self, stream, raise_exceptions=False, **kwargs):
+    def __init__(self, stream, **kwargs):
         super(ChunkedDataReader, self).__init__(stream, **kwargs)
         self.all_chunks_read = False
         self.not_chunked = False
 
         # if False, we'll use best-guess fallback for parse errors
-        self.raise_chunked_data_exceptions = raise_exceptions
+        self.raise_chunked_data_exceptions = kwargs.get('raise_exceptions')
 
     def _fillbuff(self, block_size=None):
         if self.not_chunked:
diff --git a/warcio/recordloader.py b/warcio/recordloader.py
index 2f48233b..3903f4b1 100644
--- a/warcio/recordloader.py
+++ b/warcio/recordloader.py
@@ -23,6 +23,7 @@ def __init__(self, *args, **kwargs):
          self.http_headers, self.content_type, self.length) = args
         self.payload_length = -1
         self.digest_checker = kwargs.get('digest_checker')
+        self.raise_exceptions = kwargs.get('raise_exceptions')
 
     def content_stream(self):
         if not self.http_headers:
@@ -37,9 +38,9 @@ def content_stream(self):
                 encoding = None
 
         if self.http_headers.get_header('transfer-encoding') == 'chunked':
-            return ChunkedDataReader(self.raw_stream, decomp_type=encoding)
+            return ChunkedDataReader(self.raw_stream, decomp_type=encoding, raise_exceptions=self.raise_exceptions)
         elif encoding:
-            return BufferedReader(self.raw_stream, decomp_type=encoding)
+            return BufferedReader(self.raw_stream, decomp_type=encoding, raise_exceptions=self.raise_exceptions)
         else:
             return self.raw_stream
 
@@ -58,7 +59,7 @@ class ArcWarcRecordLoader(object):
     NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:')
     HTTP_SCHEMES = ('http:', 'https:')
 
-    def __init__(self, verify_http=True, arc2warc=True, fixup_bugs=True):
+    def __init__(self, verify_http=True, arc2warc=True, fixup_bugs=True, raise_exceptions=False):
         if arc2warc:
             self.arc_parser = ARC2WARCHeadersParser()
         else:
@@ -69,6 +70,7 @@ def __init__(self, verify_http=True, arc2warc=True, fixup_bugs=True):
 
         self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http)
         self.fixup_bugs = fixup_bugs
+        self.raise_exceptions = raise_exceptions
 
     def parse_record_stream(self, stream,
                             statusline=None,
@@ -150,7 +152,7 @@ def parse_record_stream(self, stream,
 
         return ArcWarcRecord(the_format, rec_type,
                              rec_headers, stream, http_headers,
-                             content_type, length, digest_checker=digest_checker)
+                             content_type, length, digest_checker=digest_checker, raise_exceptions=self.raise_exceptions)
 
     def wrap_digest_verifying_stream(self, stream, rec_type, rec_headers, digest_checker, length=None):
         payload_digest = rec_headers.get_header('WARC-Payload-Digest')
diff --git a/warcio/tester.py b/warcio/tester.py
index 68f108b2..84167c4c 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -8,6 +8,7 @@
 from warcio.archiveiterator import WARCIterator
 from warcio.utils import to_native_str, Digester
 from warcio.exceptions import ArchiveLoadFailed
+from warcio.bufferedreaders import ChunkedDataException, DecompressionException
 
 
 class Commentary(object):
@@ -838,7 +839,7 @@ def _process_one(warcfile, all_records, concurrent_to, verbose):
     if warcfile.endswith('.arc') or warcfile.endswith('.arc.gz'):
         return
     with open(warcfile, 'rb') as stream:
-        for record in WARCIterator(stream, check_digests=True, fixup_bugs=False):
+        for record in WARCIterator(stream, check_digests=True, fixup_bugs=False, raise_exceptions=True):
 
             record = WrapRecord(record)
             digest_present = (record.rec_headers.get_header('WARC-Payload-Digest') or
@@ -847,7 +848,14 @@ def _process_one(warcfile, all_records, concurrent_to, verbose):
             commentary = validate_record(record)
             save_global_info(record, warcfile, commentary, all_records, concurrent_to)
 
-            record.stream_for_digest_check()
+            try:
+                record.stream_for_digest_check()
+            except ChunkedDataException:
+                commentary.error('Transfer-Encoding: chunked, saw an error attempting to unchunk')
+                pass
+            except DecompressionException as e:
+                commentary.error('Content-Encoding indicates compression, saw an error attempting to decompress: '+str(e))
+                pass
 
             if verbose or commentary.has_comments() or record.digest_checker.passed is False:
                 print(' ', 'WARC-Record-ID', commentary.record_id())

From 08e6bd9c88ab743c1794d15b7ec79711d6db6808 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Fri, 1 Feb 2019 22:13:07 -0800
Subject: [PATCH 62/68] make content_stream() repeatable by caching the stream

---
 warcio/recordloader.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/warcio/recordloader.py b/warcio/recordloader.py
index 3903f4b1..d5523f75 100644
--- a/warcio/recordloader.py
+++ b/warcio/recordloader.py
@@ -24,11 +24,15 @@ def __init__(self, *args, **kwargs):
         self.payload_length = -1
         self.digest_checker = kwargs.get('digest_checker')
         self.raise_exceptions = kwargs.get('raise_exceptions')
+        self._content_stream = None
 
     def content_stream(self):
         if not self.http_headers:
             return self.raw_stream
 
+        if self._content_stream:
+            return self._content_stream
+
         encoding = self.http_headers.get_header('content-encoding')
 
         if encoding:
@@ -38,11 +42,13 @@ def content_stream(self):
                 encoding = None
 
         if self.http_headers.get_header('transfer-encoding') == 'chunked':
-            return ChunkedDataReader(self.raw_stream, decomp_type=encoding, raise_exceptions=self.raise_exceptions)
+            self._content_stream = ChunkedDataReader(self.raw_stream, decomp_type=encoding, raise_exceptions=self.raise_exceptions)
         elif encoding:
-            return BufferedReader(self.raw_stream, decomp_type=encoding, raise_exceptions=self.raise_exceptions)
+            self._content_stream = BufferedReader(self.raw_stream, decomp_type=encoding, raise_exceptions=self.raise_exceptions)
         else:
-            return self.raw_stream
+            self._content_stream = self.raw_stream
+
+        return self._content_stream
 
 
 #=================================================================

From d1f48ed5dc108b6038e9e54cc73788556e75ab2e Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Fri, 1 Feb 2019 22:13:26 -0800
Subject: [PATCH 63/68] narrow exception; fix bug not reading to the end of a
 chunked buffer

---
 warcio/bufferedreaders.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/warcio/bufferedreaders.py b/warcio/bufferedreaders.py
index 734ce23a..74adae51 100644
--- a/warcio/bufferedreaders.py
+++ b/warcio/bufferedreaders.py
@@ -38,9 +38,8 @@ def brotli_decompressor():
 
 #=================================================================
 class DecompressionException(Exception):
-    def __init__(self, msg, data=b''):
+    def __init__(self, msg):
         Exception.__init__(self, msg)
-        self.data = data
 
 
 #=================================================================
@@ -144,7 +143,7 @@ def _decompress(self, data):
         if self.decompressor and data:
             try:
                 data = self.decompressor.decompress(data)
-            except Exception as e:
+            except zlib.error as e:
                 # if first read attempt, assume non-gzipped stream
                 if self.num_block_read == 0:
                     if self.decomp_type == 'deflate':

From 6e44a44af2278e09cc35001829a7b9acb34779c9 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Sat, 2 Feb 2019 09:30:51 -0800
Subject: [PATCH 64/68] report unchunk/decompress failures as comments

---
 warcio/tester.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/warcio/tester.py b/warcio/tester.py
index 84167c4c..84ea75c3 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -840,6 +840,7 @@ def _process_one(warcfile, all_records, concurrent_to, verbose):
         return
     with open(warcfile, 'rb') as stream:
         for record in WARCIterator(stream, check_digests=True, fixup_bugs=False, raise_exceptions=True):
+        #for record in WARCIterator(stream, check_digests=True, fixup_bugs=False):
 
             record = WrapRecord(record)
             digest_present = (record.rec_headers.get_header('WARC-Payload-Digest') or
@@ -850,11 +851,11 @@ def _process_one(warcfile, all_records, concurrent_to, verbose):
 
             try:
                 record.stream_for_digest_check()
-            except ChunkedDataException:
-                commentary.error('Transfer-Encoding: chunked, saw an error attempting to unchunk')
+            except ChunkedDataException as e:
+                commentary.comment('Transfer-Encoding: chunked, saw exception: '+str(e))
                 pass
             except DecompressionException as e:
-                commentary.error('Content-Encoding indicates compression, saw an error attempting to decompress: '+str(e))
+                commentary.comment('Content-Encoding indicates compression, saw: '+str(e))
                 pass
 
             if verbose or commentary.has_comments() or record.digest_checker.passed is False:

From 59198eb4f77292565fe65613fa662cce449f4db6 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Wed, 6 Feb 2019 11:53:02 -0800
Subject: [PATCH 65/68] put tester output in external files

---
 test/data/example-digest-bad.warc.test        |  22 ++
 test/data/example.warc.test                   |  16 +
 .../standard-torture-validate-field.warc.test |  80 ++++
 ...standard-torture-validate-record.warc.test | 112 ++++++
 test/test_tester.py                           |  96 +++++
 test/test_tests.py                            | 348 ------------------
 6 files changed, 326 insertions(+), 348 deletions(-)
 create mode 100644 test/data/example-digest-bad.warc.test
 create mode 100644 test/data/example.warc.test
 create mode 100644 test/data/standard-torture-validate-field.warc.test
 create mode 100644 test/data/standard-torture-validate-record.warc.test
 create mode 100644 test/test_tester.py
 delete mode 100644 test/test_tests.py

diff --git a/test/data/example-digest-bad.warc.test b/test/data/example-digest-bad.warc.test
new file mode 100644
index 00000000..15a5efaf
--- /dev/null
+++ b/test/data/example-digest-bad.warc.test
@@ -0,0 +1,22 @@
+test/data/example-digest-bad.warc
+  WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
+    WARC-Type request
+    payload digest failed: sha1:1112H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ
+    error: WARC-IP-Address should be used for http and https requests
+  WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
+    WARC-Type request
+    digest pass
+    error: WARC-IP-Address should be used for http and https requests
+    error: Duplicate WARC-Record-ID: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
+  WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
+    WARC-Type request
+    digest pass
+    error: WARC-IP-Address should be used for http and https requests
+    error: Duplicate WARC-Record-ID: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
+  WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
+    WARC-Type request
+    digest pass
+    error: WARC-IP-Address should be used for http and https requests
+    error: Duplicate WARC-Record-ID: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
+global Concurrent-To checks
+  comment: WARC-Concurrent-To not found: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007> WARC-Concurrent-To <urn:uuid:a9c51e3e-0221-11e7-bf66-0242ac120005>
diff --git a/test/data/example.warc.test b/test/data/example.warc.test
new file mode 100644
index 00000000..52b3c79f
--- /dev/null
+++ b/test/data/example.warc.test
@@ -0,0 +1,16 @@
+test/data/example.warc
+  WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
+    WARC-Type request
+    digest not present
+    error: WARC-IP-Address should be used for http and https requests
+  WARC-Record-ID <urn:uuid:e6e395ca-0221-11e7-a18d-0242ac120005>
+    WARC-Type revisit
+    digest present but not checked (revisit)
+    recommendation: Missing recommended header: WARC-Refers-To
+    comment: This Heretrix extension never made it into the standard: WARC-Profile http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest
+    comment: Field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com/
+    comment: Field was introduced after this warc version: 1.0 WARC-Refers-To-Date 2017-03-06T04:02:06Z
+  WARC-Record-ID <urn:uuid:e6e41fea-0221-11e7-8fe3-0242ac120007>
+    WARC-Type request
+    digest not present
+    error: WARC-IP-Address should be used for http and https requests
diff --git a/test/data/standard-torture-validate-field.warc.test b/test/data/standard-torture-validate-field.warc.test
new file mode 100644
index 00000000..de2e3fe1
--- /dev/null
+++ b/test/data/standard-torture-validate-field.warc.test
@@ -0,0 +1,80 @@
+test/data/standard-torture-validate-field.warc
+  WARC-Record-ID <urn:uuid:torture-validate-field>
+    WARC-Type does-not-exist
+    unknown hash algorithm name in block digest
+    error: uri must not be within <>: WARC-Target-URI <http://example.com/>
+    error: Duplicate field seen: WARC-Target-URI example.com
+    error: Invalid uri, no scheme: WARC-Target-URI example.com
+    error: Duplicate field seen: WARC-Target-URI ex ample.com
+    error: Invalid uri, no scheme: WARC-Target-URI ex ample.com
+    error: Invalid uri, contains whitespace: WARC-Target-URI ex ample.com
+    error: Duplicate field seen: WARC-Target-URI h<>ttp://example.com/
+    error: Invalid uri scheme, bad character: WARC-Target-URI h<>ttp://example.com/
+    error: Duplicate field seen: WARC-Type CAPITALIZED
+    error: uri must be within <>: WARC-Concurrent-To http://example.com/
+    error: Duplicate field seen: WARC-Date 2017-03-06T04:03:53.Z
+    error: Invalid timestamp: WARC-Date 2017-03-06T04:03:53.Z
+    error: WARC versions <= 1.0 may not have timestamps with fractional seconds: WARC-Date 2017-03-06T04:03:53.Z
+    error: Must contain a /: Content-Type asdf
+    error: Invalid subtype: Content-Type asdf
+    error: Duplicate field seen: Content-Type has space/asdf
+    error: Invalid type: Content-Type has space/asdf
+    error: Duplicate field seen: Content-Type asdf/has space
+    error: Invalid subtype: Content-Type asdf/has space
+    error: Duplicate field seen: Content-Type asdf/has space;asdf
+    error: Invalid subtype: Content-Type asdf/has space;asdf
+    error: Missing algorithm: WARC-Block-Digest asdf
+    error: Duplicate field seen: WARC-Block-Digest has space:asdf
+    error: Invalid algorithm: WARC-Block-Digest has space:asdf
+    error: Duplicate field seen: WARC-Block-Digest sha1:&$*^&*^#*&^
+    error: Invalid ip: WARC-IP-Address 1.2.3.4.5
+    error: uri must be within <>: WARC-Warcinfo-ID asdf:asdf
+    error: Duplicate field seen: WARC-Profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest
+    error: Must contain a /: WARC-Identified-Payload-Type asdf
+    error: Invalid subtype: WARC-Identified-Payload-Type asdf
+    error: uri must be within <>: WARC-Segment-Origin-ID http://example.com
+    error: Must be an integer: WARC-Segment-Number not-an-integer
+    error: Duplicate field seen: WARC-Segment-Number 0
+    error: Must be 1 or greater: WARC-Segment-Number 0
+    error: Non-continuation records must always have WARC-Segment-Number: 1: WARC-Segment-Number 0
+    error: Duplicate field seen: WARC-Segment-Number 1
+    error: Duplicate field seen: WARC-Segment-Number 2
+    error: Non-continuation records must always have WARC-Segment-Number: 1: WARC-Segment-Number 2
+    error: Duplicate field seen: WARC-Segment-Total-Length not-an-integer
+    error: Must be an integer: WARC-Segment-Total-Length not-an-integer
+    error: Invalid timestamp: WARC-Refers-To-Date not-a-date
+    comment: Unknown WARC-Type: WARC-Type does-not-exist
+    comment: WARC-Type is not lower-case: WARC-Type CAPITALIZED
+    comment: Unknown WARC-Type: WARC-Type CAPITALIZED
+    comment: Unknown digest algorithm: WARC-Block-Digest asdf
+    comment: Invalid-looking digest value: WARC-Block-Digest sha1:&$*^&*^#*&^
+    comment: Unknown value, perhaps an extension: WARC-Truncated invalid
+    comment: Unknown value, perhaps an extension: WARC-Profile asdf
+    comment: Field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com
+    comment: Field was introduced after this warc version: 1.0 WARC-Refers-To-Date not-a-date
+    comment: This Heretrix extension never made it into the standard: WARC-Refers-To-Filename asdf
+    comment: This Heretrix extension never made it into the standard: WARC-Refers-To-File-Offset 1234
+    comment: Unknown field, no validation performed: WARC-Unknown-Field asdf
+  WARC-Record-ID None
+    WARC-Type invalid
+    digest not present
+    error: Duplicate field seen: WARC-Date 2017-03-06T04:03:53.Z
+    error: Invalid timestamp: WARC-Date 2017-03-06T04:03:53.Z
+    error: Duplicate field seen: WARC-Date 2017-03-06T04:03:53.0Z
+    comment: Unknown WARC-Type: WARC-Type invalid
+  WARC-Record-ID None
+    WARC-Type request
+    digest not present
+    error: Segmented records must have both WARC-Segment-Number and WARC-Segment-Origin-ID
+    error: Missing required header: Content-Type
+    error: Missing required header: WARC-Date
+    error: Missing required header: WARC-Record-ID
+    error: Missing required header: WARC-Target-URI
+    recommendation: Do not segment WARC-Type request
+  saw exception ArchiveLoadFailed: Invalid WARC record, first line: WARC/invalid
+  skipping rest of file
+global warcinfo checks
+  comment: WARC-Warcinfo-ID not found: <urn:uuid:torture-validate-field> WARC-Warcinfo-ID asdf:asdf
+global Concurrent-To checks
+  comment: WARC-Concurrent-To not found: <urn:uuid:torture-validate-field> WARC-Concurrent-To <uri:urn:asdf-asdf-asdf>
+  comment: WARC-Concurrent-To not found: <urn:uuid:torture-validate-field> WARC-Concurrent-To http://example.com/
diff --git a/test/data/standard-torture-validate-record.warc.test b/test/data/standard-torture-validate-record.warc.test
new file mode 100644
index 00000000..e7b17345
--- /dev/null
+++ b/test/data/standard-torture-validate-record.warc.test
@@ -0,0 +1,112 @@
+test/data/standard-torture-validate-record.warc
+  WARC-Record-ID None
+    WARC-Type warcinfo
+    digest not present
+    error: uri must be within <>: WARC-Refers-To probhibited
+    error: Missing required header: WARC-Date
+    error: Missing required header: WARC-Record-ID
+    error: Field not allowed in record type: warcinfo WARC-Refers-To
+    error: warc-fields contains invalid utf-8: 'utf-8' codec can't decode byte 0xc3 in position 57: invalid continuation byte
+    comment: The first line of warc-fields cannot start with whitespace
+    comment: warc-fields lines must end with \r\n: test: lines should end with \r\n
+    comment: Missing colon in warc-fields line: no colon
+    comment: Invalid warc-fields name: token cannot have a space
+  WARC-Record-ID <uri:uuid:test-empty-warc-fields>
+    WARC-Type warcinfo
+    digest not present
+    error: Missing required header: WARC-Date
+    comment: warc-fields block present but empty
+  WARC-Record-ID <uri:uuid:test-warcinfo-non-recommended-content-type>
+    WARC-Type warcinfo
+    digest not present
+    error: Missing required header: WARC-Date
+    recommendation: warcinfo Content-Type recommended to be application/warc-fields: not-application/warc-fields
+  WARC-Record-ID <uri:uuid:test-response-content-type>
+    WARC-Type response
+    digest not present
+    error: Missing required header: WARC-Date
+    error: Responses for http/https should have Content-Type of application/http; msgtype=response or application/http: text/plain
+    error: WARC-IP-Address should be used for http and https responses
+    error: http/https responses should have http headers
+  WARC-Record-ID <uri:uuid:test-resource-dns-content-type>
+    WARC-Type resource
+    digest not present
+    error: Missing required header: WARC-Date
+    error: resource records for dns shall have Content-Type of text/dns: text/plain
+  WARC-Record-ID <uri:uuid:test-resource-dns-empty>
+    WARC-Type resource
+    digest not present
+    error: Missing required header: WARC-Date
+    comment: Unknown field, no validation performed: WARC-Test-TODO add another with valid block
+  WARC-Record-ID <uri:uuid:test-resource-not-dns>
+    WARC-Type resource
+    digest not present
+    error: Missing required header: Content-Type
+    error: Missing required header: WARC-Date
+  WARC-Record-ID <uri:uuid:test-request-content-type>
+    WARC-Type request
+    digest not present
+    error: Missing required header: WARC-Date
+    error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http: text/plain
+    error: WARC-IP-Address should be used for http and https requests
+  WARC-Record-ID <uri:uuid:test-request-content-type-with-ip>
+    WARC-Type request
+    digest not present
+    error: Missing required header: WARC-Date
+    error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http: text/plain
+  WARC-Record-ID <uri:uuid:test-metadata-warc-fields-empty>
+    WARC-Type metadata
+    digest not present
+    error: Missing required header: WARC-Date
+    comment: warc-fields block present but empty
+  WARC-Record-ID <uri:uuid:test-metadata-not-warc-fields>
+    WARC-Type metadata
+    digest not present
+    error: Missing required header: WARC-Date
+  WARC-Record-ID <uri:uuid:test-revisit-profile-unknown>
+    WARC-Type revisit
+    digest not present
+    error: Missing required header: Content-Type
+    error: Missing required header: WARC-Date
+    error: Missing required header: WARC-Target-URI
+    comment: Unknown value, perhaps an extension: WARC-Profile none
+    comment: No revisit details validation done due to unknown profile: none
+  WARC-Record-ID <uri:uuid:test-revisit-profile-future>
+    WARC-Type revisit
+    digest not present
+    error: Missing required header: Content-Type
+    error: Missing required header: WARC-Date
+    error: Missing required header: WARC-Target-URI
+    error: Missing required header: WARC-Payload-Digest
+    recommendation: Missing recommended header: WARC-Refers-To
+    recommendation: Missing recommended header: WARC-Refers-To-Date
+    recommendation: Missing recommended header: WARC-Refers-To-Target-URI
+    comment: WARC-Profile value is for a different version: 1.0 http://netpreserve.org/warc/1.1/revisit/identical-payload-digest
+  WARC-Record-ID <uri:uuid:test-revisit-profile-good>
+    WARC-Type revisit
+    digest not present
+    error: Missing required header: Content-Type
+    error: Missing required header: WARC-Date
+    error: Missing required header: WARC-Target-URI
+    recommendation: Missing recommended header: WARC-Refers-To
+    recommendation: Missing recommended header: WARC-Refers-To-Date
+  WARC-Record-ID <uri:uuid:test-conversion>
+    WARC-Type conversion
+    digest not present
+    error: Missing required header: WARC-Date
+    error: Missing required header: WARC-Target-URI
+  WARC-Record-ID <uri:uuid:test-continuation-segment-1>
+    WARC-Type continuation
+    digest not present
+    error: Missing required header: WARC-Date
+    error: Missing required header: WARC-Segment-Origin-ID
+    error: Missing required header: WARC-Target-URI
+    error: continuation record must have WARC-Segment-Number > 1: 1
+    comment: warcio test continuation code has not been tested, expect bugs
+  WARC-Record-ID <uri:uuid:test-continuation-segment-valid>
+    WARC-Type continuation
+    digest not present
+    error: Missing required header: WARC-Date
+    error: Missing required header: WARC-Segment-Origin-ID
+    error: Missing required header: WARC-Target-URI
+    comment: warcio test continuation code has not been tested, expect bugs
diff --git a/test/test_tester.py b/test/test_tester.py
new file mode 100644
index 00000000..49b1cc6d
--- /dev/null
+++ b/test/test_tester.py
@@ -0,0 +1,96 @@
+from warcio.cli import main
+from warcio.utils import to_native_str
+import warcio.tester
+
+from . import get_test_file
+from .test_cli import patch_stdout
+
+
+file_map = {}
+
+
+def map_test_file(filename):
+    file_map[filename] = get_test_file(filename)
+    return file_map[filename]
+
+
+def helper(args, expected_exit_value):
+    with patch_stdout() as buff:
+        exit_value = None
+        try:
+            main(args=args)
+        except SystemExit as e:
+            exit_value = e.code
+        finally:
+            assert exit_value == expected_exit_value
+
+        return to_native_str(buff.getvalue())
+
+
+def remove_before_test_data(s):
+    ret = ''
+    for line in s.splitlines(True):
+        for filename, value in file_map.items():
+            if value in line:
+                line = line.replace(value, 'test/data/' + filename)
+        ret += line
+    return ret
+
+
+def run_one(f):
+    args = ['test']
+    args.append(f)
+
+    with open(f+'.test', 'r') as expectedf:
+        expected = expectedf.read()
+
+    value = helper(args, 0)
+    print(remove_before_test_data(value))
+
+    actual = remove_before_test_data(value)
+
+    assert actual == expected
+
+
+def test_torture():
+    files = ['standard-torture-validate-record.warc',
+             'standard-torture-validate-field.warc']
+    [run_one(map_test_file(filename)) for filename in files]
+
+
+def test_arc():
+    files = ['does-not-exist.arc']
+    files = [map_test_file(filename) for filename in files]
+
+    args = ['test']
+    args.extend(files)
+
+    expected = """\
+test/data/does-not-exist.arc
+"""
+
+    value = helper(args, 0)
+    assert remove_before_test_data(value) == expected
+
+
+def test_digests():
+    # needed for test coverage
+    files = ['example-digest-bad.warc', 'example.warc']
+    [run_one(map_test_file(filename)) for filename in files]
+
+
+def test_leftovers():
+    commentary = warcio.tester.Commentary('id', 'type')
+    assert not commentary.has_comments()
+
+    # hard to test because invalid WARC Content-Length raises in archiveiterator
+    warcio.tester.validate_content_length('Content-Length', 'not-an-integer', None, '1.0', commentary, None)
+
+    # hard to test because warcio raises for unknown WARC version
+    warcio.tester.validate_profile('blah', 'blah', None, '999', commentary, None)
+
+    expected = '''\
+error: Must be an integer: Content-Length not-an-integer
+'''
+
+    assert '\n'.join(commentary.comments())+'\n' == expected
diff --git a/test/test_tests.py b/test/test_tests.py
deleted file mode 100644
index 200df8ae..00000000
--- a/test/test_tests.py
+++ /dev/null
@@ -1,348 +0,0 @@
-from warcio.cli import main
-from warcio.utils import to_native_str
-import warcio.tester
-
-from . import get_test_file
-from .test_cli import patch_stdout
-
-
-file_map = {}
-
-
-def map_test_file(filename):
-    file_map[filename] = get_test_file(filename)
-    return file_map[filename]
-
-
-def helper(args, expected_exit_value):
-    with patch_stdout() as buff:
-        exit_value = None
-        try:
-            main(args=args)
-        except SystemExit as e:
-            exit_value = e.code
-        finally:
-            assert exit_value == expected_exit_value
-
-        return to_native_str(buff.getvalue())
-
-
-def remove_before_test_data(s):
-    ret = ''
-    for line in s.splitlines(True):
-        for filename, value in file_map.items():
-            if value in line:
-                line = line.replace(value, 'test/data/' + filename)
-        ret += line
-    return ret
-
-
-def test_torture_validate_record():
-    files = ['standard-torture-validate-record.warc']
-    files = [map_test_file(filename) for filename in files]
-
-    args = ['test']
-    args.extend(files)
-
-    expected = """\
-test/data/standard-torture-validate-record.warc
-  WARC-Record-ID None
-    WARC-Type warcinfo
-    digest not present
-    error: uri must be within <>: WARC-Refers-To probhibited
-    error: missing required header: WARC-Date
-    error: missing required header: WARC-Record-ID
-    error: field not allowed in record type: warcinfo WARC-Refers-To
-    error: warc-fields contains invalid utf-8: 'utf-8' codec can't decode byte 0xc3 in position 57: invalid continuation byte
-    comment: The first line of warc-fields cannot start with whitespace
-    comment: warc-fields lines must end with \\r\\n: test: lines should end with \\r\\n
-    comment: Missing colon in warc-fields line: no colon
-    comment: Invalid warc-fields name: token cannot have a space
-  WARC-Record-ID <uri:uuid:test-empty-warc-fields>
-    WARC-Type warcinfo
-    digest not present
-    error: missing required header: WARC-Date
-    comment: warc-fields block present but empty
-  WARC-Record-ID <uri:uuid:test-warcinfo-non-recommended-content-type>
-    WARC-Type warcinfo
-    digest not present
-    error: missing required header: WARC-Date
-    recommendation: warcinfo Content-Type recommended to be application/warc-fields: not-application/warc-fields
-  WARC-Record-ID <uri:uuid:test-response-content-type>
-    WARC-Type response
-    digest not present
-    error: missing required header: WARC-Date
-    error: responses for http/https should have Content-Type of application/http; msgtype=response or application/http: text/plain
-    error: WARC-IP-Address should be used for http and https responses
-    error: http/https responses should have http headers
-  WARC-Record-ID <uri:uuid:test-resource-dns-content-type>
-    WARC-Type resource
-    digest not present
-    error: missing required header: WARC-Date
-    error: resource records for dns shall have Content-Type of text/dns: text/plain
-  WARC-Record-ID <uri:uuid:test-resource-dns-empty>
-    WARC-Type resource
-    digest not present
-    error: missing required header: WARC-Date
-    comment: unknown field, no validation performed: WARC-Test-TODO add another with valid block
-  WARC-Record-ID <uri:uuid:test-resource-not-dns>
-    WARC-Type resource
-    digest not present
-    error: missing required header: Content-Type
-    error: missing required header: WARC-Date
-  WARC-Record-ID <uri:uuid:test-request-content-type>
-    WARC-Type request
-    digest not present
-    error: missing required header: WARC-Date
-    error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http: text/plain
-    error: WARC-IP-Address should be used for http and https requests
-  WARC-Record-ID <uri:uuid:test-request-content-type-with-ip>
-    WARC-Type request
-    digest not present
-    error: missing required header: WARC-Date
-    error: requests for http/https should have Content-Type of application/http; msgtype=request or application/http: text/plain
-  WARC-Record-ID <uri:uuid:test-metadata-warc-fields-empty>
-    WARC-Type metadata
-    digest not present
-    error: missing required header: WARC-Date
-    comment: warc-fields block present but empty
-  WARC-Record-ID <uri:uuid:test-metadata-not-warc-fields>
-    WARC-Type metadata
-    digest not present
-    error: missing required header: WARC-Date
-  WARC-Record-ID <uri:uuid:test-revisit-profile-unknown>
-    WARC-Type revisit
-    digest not present
-    error: missing required header: Content-Type
-    error: missing required header: WARC-Date
-    error: missing required header: WARC-Target-URI
-    comment: unknown value, perhaps an extension: WARC-Profile none
-    comment: no revisit details validation done due to unknown profile: none
-  WARC-Record-ID <uri:uuid:test-revisit-profile-future>
-    WARC-Type revisit
-    digest not present
-    error: missing required header: Content-Type
-    error: missing required header: WARC-Date
-    error: missing required header: WARC-Target-URI
-    error: missing required header: WARC-Payload-Digest
-    recommendation: missing recommended header: WARC-Refers-To
-    recommendation: missing recommended header: WARC-Refers-To-Date
-    recommendation: missing recommended header: WARC-Refers-To-Target-URI
-    comment: WARC-Profile value is for a different version: 1.0 http://netpreserve.org/warc/1.1/revisit/identical-payload-digest
-  WARC-Record-ID <uri:uuid:test-revisit-profile-good>
-    WARC-Type revisit
-    digest not present
-    error: missing required header: Content-Type
-    error: missing required header: WARC-Date
-    error: missing required header: WARC-Target-URI
-    recommendation: missing recommended header: WARC-Refers-To
-    recommendation: missing recommended header: WARC-Refers-To-Date
-  WARC-Record-ID <uri:uuid:test-conversion>
-    WARC-Type conversion
-    digest not present
-    error: missing required header: WARC-Date
-    error: missing required header: WARC-Target-URI
-  WARC-Record-ID <uri:uuid:test-continuation-segment-1>
-    WARC-Type continuation
-    digest not present
-    error: missing required header: WARC-Date
-    error: missing required header: WARC-Segment-Origin-ID
-    error: missing required header: WARC-Target-URI
-    error: continuation record must have WARC-Segment-Number > 1: 1
-    comment: warcio test continuation code has not been tested, expect bugs
-  WARC-Record-ID <uri:uuid:test-continuation-segment-valid>
-    WARC-Type continuation
-    digest not present
-    error: missing required header: WARC-Date
-    error: missing required header: WARC-Segment-Origin-ID
-    error: missing required header: WARC-Target-URI
-    comment: warcio test continuation code has not been tested, expect bugs
-"""
-
-    value = helper(args, 0)
-    print(remove_before_test_data(value))
-
-    actual = remove_before_test_data(value)
-
-    assert actual == expected
-
-
-def test_torture_validate_field():
-    files = ['standard-torture-validate-field.warc']
-    files = [map_test_file(filename) for filename in files]
-
-    args = ['test']
-    args.extend(files)
-
-    expected = """\
-test/data/standard-torture-validate-field.warc
-  WARC-Record-ID <urn:uuid:torture-validate-field>
-    WARC-Type does-not-exist
-    unknown hash algorithm name in block digest
-    error: uri must not be within <>: WARC-Target-URI <http://example.com/>
-    error: invalid uri scheme, bad character: WARC-Target-URI <http://example.com/>
-    error: duplicate field seen: WARC-Target-URI example.com
-    error: invalid uri, no scheme: WARC-Target-URI example.com
-    error: duplicate field seen: WARC-Target-URI ex ample.com
-    error: invalid uri, no scheme: WARC-Target-URI ex ample.com
-    error: invalid uri, contains whitespace: WARC-Target-URI ex ample.com
-    error: invalid uri scheme, bad character: WARC-Target-URI ex ample.com
-    error: duplicate field seen: WARC-Target-URI h<>ttp://example.com/
-    error: invalid uri scheme, bad character: WARC-Target-URI h<>ttp://example.com/
-    error: duplicate field seen: WARC-Type CAPITALIZED
-    error: uri must be within <>: WARC-Concurrent-To http://example.com/
-    error: duplicate field seen: WARC-Date 2017-03-06T04:03:53.Z
-    error: Invalid timestamp: WARC-Date 2017-03-06T04:03:53.Z
-    error: WARC versions <= 1.0 may not have timestamps with fractional seconds: WARC-Date 2017-03-06T04:03:53.Z
-    error: must contain a /: Content-Type asdf
-    error: invalid subtype: Content-Type asdf
-    error: duplicate field seen: Content-Type has space/asdf
-    error: invalid type: Content-Type has space/asdf
-    error: duplicate field seen: Content-Type asdf/has space
-    error: invalid subtype: Content-Type asdf/has space
-    error: duplicate field seen: Content-Type asdf/has space;asdf
-    error: invalid subtype: Content-Type asdf/has space;asdf
-    error: missing algorithm: WARC-Block-Digest asdf
-    error: duplicate field seen: WARC-Block-Digest has space:asdf
-    error: invalid algorithm: WARC-Block-Digest has space:asdf
-    error: duplicate field seen: WARC-Block-Digest sha1:&$*^&*^#*&^
-    error: invalid ip: WARC-IP-Address 1.2.3.4.5
-    error: uri must be within <>: WARC-Warcinfo-ID asdf:asdf
-    error: duplicate field seen: WARC-Profile http://netpreserve.org/warc/1.0/revisit/identical-payload-digest
-    error: must contain a /: WARC-Identified-Payload-Type asdf
-    error: invalid subtype: WARC-Identified-Payload-Type asdf
-    error: uri must be within <>: WARC-Segment-Origin-ID http://example.com
-    error: must be an integer: WARC-Segment-Number not-an-integer
-    error: duplicate field seen: WARC-Segment-Number 0
-    error: must be 1 or greater: WARC-Segment-Number 0
-    error: non-continuation records must always have WARC-Segment-Number: 1: WARC-Segment-Number 0
-    error: duplicate field seen: WARC-Segment-Number 1
-    error: duplicate field seen: WARC-Segment-Number 2
-    error: non-continuation records must always have WARC-Segment-Number: 1: WARC-Segment-Number 2
-    error: duplicate field seen: WARC-Segment-Total-Length not-an-integer
-    error: must be an integer: WARC-Segment-Total-Length not-an-integer
-    error: Invalid timestamp: WARC-Refers-To-Date not-a-date
-    comment: unknown WARC-Type: WARC-Type does-not-exist
-    comment: WARC-Type is not lower-case: WARC-Type CAPITALIZED
-    comment: unknown WARC-Type: WARC-Type CAPITALIZED
-    comment: unknown digest algorithm: WARC-Block-Digest asdf
-    comment: Invalid-looking digest value: WARC-Block-Digest sha1:&$*^&*^#*&^
-    comment: unknown value, perhaps an extension: WARC-Truncated invalid
-    comment: unknown value, perhaps an extension: WARC-Profile asdf
-    comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com
-    comment: field was introduced after this warc version: 1.0 WARC-Refers-To-Date not-a-date
-    comment: This Heretrix extension never made it into the standard: WARC-Refers-To-Filename asdf
-    comment: This Heretrix extension never made it into the standard: WARC-Refers-To-File-Offset 1234
-    comment: unknown field, no validation performed: WARC-Unknown-Field asdf
-  WARC-Record-ID None
-    WARC-Type invalid
-    digest not present
-    error: duplicate field seen: WARC-Date 2017-03-06T04:03:53.Z
-    error: Invalid timestamp: WARC-Date 2017-03-06T04:03:53.Z
-    error: duplicate field seen: WARC-Date 2017-03-06T04:03:53.0Z
-    comment: unknown WARC-Type: WARC-Type invalid
-  WARC-Record-ID None
-    WARC-Type request
-    digest not present
-    error: segmented records must have both WARC-Segment-Number and WARC-Segment-Origin-ID
-    error: missing required header: Content-Type
-    error: missing required header: WARC-Date
-    error: missing required header: WARC-Record-ID
-    error: missing required header: WARC-Target-URI
-    recommendation: do not segment WARC-Type request
-global warcinfo checks
-  comment: WARC-Warcinfo-ID not found: <urn:uuid:torture-validate-field> WARC-Warcinfo-ID asdf:asdf
-global Concurrent-To checks
-  comment: WARC-Concurrent-To not found: <urn:uuid:torture-validate-field> WARC-Concurrent-To <uri:urn:asdf-asdf-asdf>
-  comment: WARC-Concurrent-To not found: <urn:uuid:torture-validate-field> WARC-Concurrent-To http://example.com/
-"""
-
-    value = helper(args, 0)
-    actual = remove_before_test_data(value)
-
-    print(actual)
-    assert actual == expected
-
-
-def test_arc():
-    files = ['does-not-exist.arc']
-    files = [map_test_file(filename) for filename in files]
-
-    args = ['test']
-    args.extend(files)
-
-    expected = """\
-test/data/does-not-exist.arc
-"""
-
-    value = helper(args, 0)
-    assert remove_before_test_data(value) == expected
-
-
-def test_digests():
-    # needed for test coverage
-    files = ['example-digest-bad.warc', 'example.warc']
-    files = [map_test_file(filename) for filename in files]
-
-    args = ['test']
-    args.extend(files)
-
-    expected = """\
-test/data/example-digest-bad.warc
-  WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
-    WARC-Type request
-    payload digest failed: sha1:1112H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ
-    error: WARC-IP-Address should be used for http and https requests
-  WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
-    WARC-Type request
-    digest pass
-    error: WARC-IP-Address should be used for http and https requests
-    error: Duplicate WARC-Record-ID: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
-  WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
-    WARC-Type request
-    digest pass
-    error: WARC-IP-Address should be used for http and https requests
-    error: Duplicate WARC-Record-ID: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
-  WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
-    WARC-Type request
-    digest pass
-    error: WARC-IP-Address should be used for http and https requests
-    error: Duplicate WARC-Record-ID: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
-test/data/example.warc
-  WARC-Record-ID <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007>
-    WARC-Type request
-    digest not present
-    error: WARC-IP-Address should be used for http and https requests
-    error: Duplicate WARC-Record-ID: <urn:uuid:a9c5c23a-0221-11e7-8fe3-0242ac120007> found in files test/data/example.warc test/data/example-digest-bad.warc
-  WARC-Record-ID <urn:uuid:e6e395ca-0221-11e7-a18d-0242ac120005>
-    WARC-Type revisit
-    digest present but not checked (revisit)
-    recommendation: Missing recommended header: WARC-Refers-To
-    comment: This Heretrix extension never made it into the standard: WARC-Profile http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest
-    comment: Field was introduced after this warc version: 1.0 WARC-Refers-To-Target-URI http://example.com/
-    comment: Field was introduced after this warc version: 1.0 WARC-Refers-To-Date 2017-03-06T04:02:06Z
-  WARC-Record-ID <urn:uuid:e6e41fea-0221-11e7-8fe3-0242ac120007>
-    WARC-Type request
-    digest not present
-    error: WARC-IP-Address should be used for http and https requests
-"""
-
-    value = helper(args, 0)
-    assert remove_before_test_data(value) == expected
-
-
-def test_leftovers():
-    commentary = warcio.tester.Commentary('id', 'type')
-    assert not commentary.has_comments()
-
-    # hard to test because invalid WARC Content-Length raises in archiveiterator
-    warcio.tester.validate_content_length('Content-Length', 'not-an-integer', None, '1.0', commentary, None)
-
-    # hard to test because warcio raises for unknown WARC version
-    warcio.tester.validate_profile('blah', 'blah', None, '999', commentary, None)
-
-    expected = '''\
-error: Must be an integer: Content-Length not-an-integer
-'''
-
-    assert '\n'.join(commentary.comments())+'\n' == expected

From b61878e28058ba3b1aa5a6363717f781dd4b0995 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Thu, 4 Apr 2019 15:01:31 -0700
Subject: [PATCH 66/68] wip

---
 test/test_tester.py       |  2 +-
 warcio/archiveiterator.py |  5 ++-
 warcio/bufferedreaders.py | 49 +++++++++++++------------
 warcio/recordloader.py    | 43 ++++++++++++++++++----
 warcio/tester.py          | 76 +++++++++------------------------------
 5 files changed, 82 insertions(+), 93 deletions(-)

diff --git a/test/test_tester.py b/test/test_tester.py
index 49b1cc6d..08963ea9 100644
--- a/test/test_tester.py
+++ b/test/test_tester.py
@@ -80,7 +80,7 @@ def test_digests():
 
 
 def test_leftovers():
-    commentary = warcio.tester.Commentary('id', 'type')
+    commentary = warcio.recordloader.Commentary()
     assert not commentary.has_comments()
 
     # hard to test because invalid WARC Content-Length raises in archiveiterator
diff --git a/warcio/archiveiterator.py b/warcio/archiveiterator.py
index 5e9c02ca..24094936 100644
--- a/warcio/archiveiterator.py
+++ b/warcio/archiveiterator.py
@@ -56,14 +56,13 @@ class ArchiveIterator(six.Iterator):
     def __init__(self, fileobj, no_record_parse=False,
                  verify_http=False, arc2warc=False,
                  ensure_http_headers=False, block_size=BUFF_SIZE,
-                 check_digests=False, fixup_bugs=True, raise_exceptions=False):
+                 check_digests=False, fixup_bugs=True):
 
         self.fh = fileobj
 
         self.loader = ArcWarcRecordLoader(verify_http=verify_http,
                                           arc2warc=arc2warc,
-                                          fixup_bugs=fixup_bugs,
-                                          raise_exceptions=raise_exceptions)
+                                          fixup_bugs=fixup_bugs)
         self.known_format = None
 
         self.mixed_arc_warc = arc2warc
diff --git a/warcio/bufferedreaders.py b/warcio/bufferedreaders.py
index 74adae51..f60ae1a5 100644
--- a/warcio/bufferedreaders.py
+++ b/warcio/bufferedreaders.py
@@ -36,12 +36,6 @@ def brotli_decompressor():
         pass
 
 
-#=================================================================
-class DecompressionException(Exception):
-    def __init__(self, msg):
-        Exception.__init__(self, msg)
-
-
 #=================================================================
 class BufferedReader(object):
     """
@@ -71,7 +65,7 @@ def __init__(self, stream, block_size=BUFF_SIZE,
                  decomp_type=None,
                  starting_data=None,
                  read_all_members=False,
-                 raise_exceptions=False):
+                 commentary=None):
 
         self.stream = stream
         self.block_size = block_size
@@ -84,7 +78,7 @@ def __init__(self, stream, block_size=BUFF_SIZE,
         self.buff_size = 0
 
         self.read_all_members = read_all_members
-        self.raise_exceptions = raise_exceptions
+        self.commentary = commentary
 
     def set_decomp(self, decomp_type):
         self._init_decomp(decomp_type)
@@ -96,6 +90,10 @@ def _init_decomp(self, decomp_type):
                 self.decomp_type = decomp_type
                 self.decompressor = self.DECOMPRESSORS[decomp_type.lower()]()
             except KeyError:
+                # XXX don't raise?
+                # we don't know if the enduser cares or not
+                # or the record might actually be uncompressed
+                # XXX what does pywb do
                 raise Exception('Decompression type not supported: ' +
                                 decomp_type)
         else:
@@ -150,8 +148,8 @@ def _decompress(self, data):
                         self._init_decomp('deflate_alt')
                         data = self._decompress(data)
                     else:
-                        if self.raise_exceptions:
-                            raise DecompressionException(str(e))
+                        if self.commentary:
+                            self.commentary.comment('Payload claimed to be compressed but apparently is not')
                         self.decompressor = None
                 # otherwise (partly decompressed), something is wrong
                 else:
@@ -290,40 +288,43 @@ class ChunkedDataReader(BufferedReader):
     If at any point the chunked header is not available, the stream is
     assumed to not be chunked and no more dechunking occurs.
     """
-    def __init__(self, stream, **kwargs):
+    def __init__(self, stream, raise_exceptions=False, commentary=None, **kwargs):
         super(ChunkedDataReader, self).__init__(stream, **kwargs)
         self.all_chunks_read = False
-        self.not_chunked = False
-
-        # if False, we'll use best-guess fallback for parse errors
-        self.raise_chunked_data_exceptions = kwargs.get('raise_exceptions')
+        self.not_actually_chunked = False
+        self.at_start = True
+        self.raise_chunked_data_exceptions = raise_exceptions
+        self.commentary = commentary
 
     def _fillbuff(self, block_size=None):
-        if self.not_chunked:
+        if self.not_actually_chunked:
             return super(ChunkedDataReader, self)._fillbuff(block_size)
 
         # Loop over chunks until there is some data (not empty())
         # In particular, gzipped data may require multiple chunks to
         # return any decompressed result
-        while (self.empty() and
-               not self.all_chunks_read and
-               not self.not_chunked):
-
+        while (self.empty() and not self.all_chunks_read):
             try:
                 length_header = self.stream.readline(64)
                 self._try_decode(length_header)
+                self.at_start = False
             except ChunkedDataException as e:
                 if self.raise_chunked_data_exceptions:
                     raise
-
                 # Can't parse the data as chunked.
                 # It's possible that non-chunked data is served
                 # with a Transfer-Encoding: chunked.
                 # Treat this as non-chunk encoded from here on.
+                if self.commentary:
+                    if self.at_start:
+                        self.commentary.comment('Buffer claimed to be chunked, but was not from the start')
+                    else:
+                        self.commentary.comment('Buffer is chunked but there was an unchunking error midway')
                 self._process_read(length_header + e.data)
-                self.not_chunked = True
+                self.not_actually_chunked = True
+                self.at_start = False
 
-                # parse as block as non-chunked
+                # parse as non-chunked
                 return super(ChunkedDataReader, self)._fillbuff(block_size)
 
     def _try_decode(self, length_header):
@@ -365,6 +366,8 @@ def _try_decode(self, length_header):
                     msg = 'Ran out of data before end of chunk'
                     raise ChunkedDataException(msg, data)
                 else:
+                    if self.commentary:
+                        self.commentary.comment('Chunked reader ran out of data before end of chunk')
                     chunk_size = data_len
                     self.all_chunks_read = True
 
diff --git a/warcio/recordloader.py b/warcio/recordloader.py
index d5523f75..729cc3c6 100644
--- a/warcio/recordloader.py
+++ b/warcio/recordloader.py
@@ -16,6 +16,36 @@
 logger = logging.getLogger(__name__)
 
 
+#=================================================================
+class Commentary(object):
+    def __init__(self):
+        self.errors = []
+        self.recommendations = []
+        self._comments = []
+
+    def error(self, *args):
+        self.errors.append(args)
+
+    def recommendation(self, *args):
+        self.recommendations.append(args)
+
+    def comment(self, *args):
+        self._comments.append(args)
+
+    def has_comments(self):
+        if self.errors or self.recommendations or self._comments:
+            return True
+
+    def comments(self):
+        # XXX str() all of these, in case an int or other thing slips in?
+        for e in self.errors:
+            yield 'error: ' + ' '.join(e)
+        for r in self.recommendations:
+            yield 'recommendation: ' + ' '.join(r)
+        for c in self._comments:
+            yield 'comment: ' + ' '.join(c)
+
+
 #=================================================================
 class ArcWarcRecord(object):
     def __init__(self, *args, **kwargs):
@@ -23,7 +53,7 @@ def __init__(self, *args, **kwargs):
          self.http_headers, self.content_type, self.length) = args
         self.payload_length = -1
         self.digest_checker = kwargs.get('digest_checker')
-        self.raise_exceptions = kwargs.get('raise_exceptions')
+        self.commentary = kwargs.get('commentary')
         self._content_stream = None
 
     def content_stream(self):
@@ -42,9 +72,9 @@ def content_stream(self):
                 encoding = None
 
         if self.http_headers.get_header('transfer-encoding') == 'chunked':
-            self._content_stream = ChunkedDataReader(self.raw_stream, decomp_type=encoding, raise_exceptions=self.raise_exceptions)
+            self._content_stream = ChunkedDataReader(self.raw_stream, decomp_type=encoding, commentary=self.commentary)
         elif encoding:
-            self._content_stream = BufferedReader(self.raw_stream, decomp_type=encoding, raise_exceptions=self.raise_exceptions)
+            self._content_stream = BufferedReader(self.raw_stream, decomp_type=encoding, commentary=self.commentary)
         else:
             self._content_stream = self.raw_stream
 
@@ -65,7 +95,7 @@ class ArcWarcRecordLoader(object):
     NON_HTTP_SCHEMES = ('dns:', 'whois:', 'ntp:')
     HTTP_SCHEMES = ('http:', 'https:')
 
-    def __init__(self, verify_http=True, arc2warc=True, fixup_bugs=True, raise_exceptions=False):
+    def __init__(self, verify_http=True, arc2warc=True, fixup_bugs=True):
         if arc2warc:
             self.arc_parser = ARC2WARCHeadersParser()
         else:
@@ -76,7 +106,6 @@ def __init__(self, verify_http=True, arc2warc=True, fixup_bugs=True, raise_excep
 
         self.http_req_parser = StatusAndHeadersParser(self.HTTP_VERBS, verify_http)
         self.fixup_bugs = fixup_bugs
-        self.raise_exceptions = raise_exceptions
 
     def parse_record_stream(self, stream,
                             statusline=None,
@@ -134,6 +163,7 @@ def parse_record_stream(self, stream,
 
         is_verifying = False
         digest_checker = DigestChecker(check_digests)
+        commentary = Commentary()
 
         # limit stream to the length for all valid records
         if length is not None and length >= 0:
@@ -158,7 +188,8 @@ def parse_record_stream(self, stream,
 
         return ArcWarcRecord(the_format, rec_type,
                              rec_headers, stream, http_headers,
-                             content_type, length, digest_checker=digest_checker, raise_exceptions=self.raise_exceptions)
+                             content_type, length, digest_checker=digest_checker,
+                             commentary=commentary)
 
     def wrap_digest_verifying_stream(self, stream, rec_type, rec_headers, digest_checker, length=None):
         payload_digest = rec_headers.get_header('WARC-Payload-Digest')
diff --git a/warcio/tester.py b/warcio/tester.py
index 84ea75c3..cee5344f 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -8,45 +8,8 @@
 from warcio.archiveiterator import WARCIterator
 from warcio.utils import to_native_str, Digester
 from warcio.exceptions import ArchiveLoadFailed
-from warcio.bufferedreaders import ChunkedDataException, DecompressionException
-
-
-class Commentary(object):
-    def __init__(self, record_id=None, rec_type=None):
-        self._record_id = record_id
-        self._rec_type = rec_type
-        self.errors = []
-        self.recommendations = []
-        self._comments = []
-
-    def record_id(self):
-        return self._record_id
-
-    def rec_type(self):
-        return self._rec_type
-
-    def error(self, *args):
-        self.errors.append(args)
-
-    def recommendation(self, *args):
-        self.recommendations.append(args)
-
-    def comment(self, *args):
-        self._comments.append(args)
-
-    def has_comments(self):
-        if self.errors or self.recommendations or self._comments:
-            return True
-
-    def comments(self):
-        # XXX str() all of these, in case an int or other thing slips in?
-        for e in self.errors:
-            yield 'error: ' + ' '.join(e)
-        for r in self.recommendations:
-            yield 'recommendation: ' + ' '.join(r)
-        for c in self._comments:
-            yield 'comment: ' + ' '.join(c)
-
+from warcio.bufferedreaders import ChunkedDataException
+from warcio.recordloader import Commentary
 
 class WrapRecord(object):
     def __init__(self, obj):
@@ -662,9 +625,7 @@ def validate_record_against_rec_type(config, record, commentary, pending):
 def validate_record(record):
     version = record.rec_headers.protocol.split('/', 1)[1]  # XXX not exported
 
-    record_id = record.rec_headers.get_header('WARC-Record-ID')
-    rec_type = record.rec_headers.get_header('WARC-Type')
-    commentary = Commentary(record_id=record_id, rec_type=rec_type)
+    commentary = record.commentary
     pending = None
 
     seen_fields = set()
@@ -683,6 +644,7 @@ def validate_record(record):
         if 'validate' in config:
             config['validate'](field, value, record, version, commentary, pending)
 
+    rec_type = record.rec_headers.get_header('WARC-Type')
     if rec_type not in record_types:
         # we print a comment for this elsewhere
         pass
@@ -839,37 +801,31 @@ def _process_one(warcfile, all_records, concurrent_to, verbose):
     if warcfile.endswith('.arc') or warcfile.endswith('.arc.gz'):
         return
     with open(warcfile, 'rb') as stream:
-        for record in WARCIterator(stream, check_digests=True, fixup_bugs=False, raise_exceptions=True):
-        #for record in WARCIterator(stream, check_digests=True, fixup_bugs=False):
-
+        for record in WARCIterator(stream, check_digests=True, fixup_bugs=False):
             record = WrapRecord(record)
             digest_present = (record.rec_headers.get_header('WARC-Payload-Digest') or
                               record.rec_headers.get_header('WARC-Block-Digest'))
+            record_id = record.rec_headers.get_header('WARC-Record-ID')
+            rec_type = record.rec_headers.get_header('WARC-Type')
 
-            commentary = validate_record(record)
-            save_global_info(record, warcfile, commentary, all_records, concurrent_to)
+            validate_record(record)
+            record.stream_for_digest_check()
 
-            try:
-                record.stream_for_digest_check()
-            except ChunkedDataException as e:
-                commentary.comment('Transfer-Encoding: chunked, saw exception: '+str(e))
-                pass
-            except DecompressionException as e:
-                commentary.comment('Content-Encoding indicates compression, saw: '+str(e))
-                pass
+            commentary = record.commentary
+            save_global_info(record, warcfile, commentary, all_records, concurrent_to)
 
             if verbose or commentary.has_comments() or record.digest_checker.passed is False:
-                print(' ', 'WARC-Record-ID', commentary.record_id())
-                print('   ', 'WARC-Type', commentary.rec_type())
-
+                print(' ', 'WARC-Record-ID', record_id)
+                print('   ', 'WARC-Type', rec_type)
                 if record.digest_checker.passed is True:
                     print('    digest pass')
                 elif record.digest_checker.passed is None:
                     if digest_present:
-                        if commentary.rec_type() == 'revisit':
+                        if rec_type == 'revisit':
                             print('    digest present but not checked (revisit)')
                         else:  # pragma: no cover
-                            # WARC record missing Content-Length: header, which is verboten
+                            # should not ever happen
+                            # example reason: WARC record missing Content-Length: header, but that case raises
                             print('    digest present but not checked')
                     else:
                         print('    digest not present')

From 2d2b7d560f4bccf694faf1a5eb7b36c020cbfdd6 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Mon, 9 Sep 2019 11:03:30 -0700
Subject: [PATCH 67/68] tests pass

---
 test/test_check_digest_examples.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/test/test_check_digest_examples.py b/test/test_check_digest_examples.py
index 679d7d24..89eb296f 100644
--- a/test/test_check_digest_examples.py
+++ b/test/test_check_digest_examples.py
@@ -9,7 +9,8 @@
         'example-iana.org-chunked.warc',
         'example-wrong-chunks.warc.gz',
         'example-bad-non-chunked.warc.gz',
-        'example-digest.warc'
+        'example-digest-bad.warc',
+        'standard-torture-validate-field.warc',
        ]
 
 
@@ -34,7 +35,7 @@ def check_helper(self, args, expected_exit_value, capsys):
         return capsys.readouterr()[0]  # list for py33 support
 
     def test_check_invalid(self, capsys):
-        filenames = [get_test_file('example-digest.warc')]
+        filenames = [get_test_file('example-digest-bad.warc')]
 
         args = ['check'] + filenames
         value = self.check_helper(args, 1, capsys)

From fc19c7d632e9440d88e6858db642686673506f20 Mon Sep 17 00:00:00 2001
From: Greg Lindahl <lindahl@pbm.com>
Date: Sun, 16 Feb 2020 13:49:08 -0800
Subject: [PATCH 68/68] comments

---
 warcio/tester.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/warcio/tester.py b/warcio/tester.py
index cee5344f..2fc8ff9b 100644
--- a/warcio/tester.py
+++ b/warcio/tester.py
@@ -93,7 +93,10 @@ def validate_warc_fields(record, commentary):
         commentary.comment('warc-fields block present but empty')
         return
 
-    # check known fields
+    # XXX check known fields
+    # warcinfo "but not limited to"
+    # metadata lacks that language
+    # https://github.com/iipc/warc-specifications/issues/7
 
 
 def validate_warcinfo(record, commentary, pending):
@@ -110,7 +113,7 @@ def validate_warcinfo(record, commentary, pending):
         #     comment if http-header-from here and in the request?
         validate_warc_fields(record, commentary)
 
-    # whole-file tests:
+    # XXX whole-file tests:
     # recommended that all files start with warcinfo
     # elsewise allowable for warcinfo to appear anywhere
 
@@ -152,6 +155,7 @@ def validate_response(record, commentary, pending):
                     if int(http_content_length) != record.raw_stream.stream.limit:
                         commentary.comment('Actual http payload length is different from http header Content-Length:',
                                            str(record.raw_stream.stream.limit), http_content_length)
+        # XXX can we say something useful if we are unable to check this length? why would it fail?
 
 
 def validate_resource(record, commentary, pending):