Skip to content

Commit 541ff72

Browse files
authored
Merge pull request #313 from edgi-govdata-archiving/310-the-encoding-was-a-lie
Fix a variety of decoding failures in the diff server
2 parents eab7e95 + 158d0a6 commit 541ff72

File tree

5 files changed

+62
-2
lines changed

5 files changed

+62
-2
lines changed

web_monitoring/diffing_server.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -307,16 +307,37 @@ def _extract_encoding(headers, content):
307307
prolog_match = XML_PROLOG_PATTERN.search(content, endpos=2048)
308308
if prolog_match:
309309
encoding = prolog_match.group(1).decode('ascii', errors='ignore')
310+
# Handle common mistakes and errors in encoding names
311+
if encoding == 'iso-8559-1':
312+
encoding = 'iso-8859-1'
313+
# Windows-1252 is so commonly mislabeled as ISO-8859-1 that WHATWG recommends assuming the label is a
314+
# mistake: https://encoding.spec.whatwg.org/#names-and-labels
315+
if encoding == 'iso-8859-1' and 'html' in content_type:
316+
encoding = 'windows-1252'
310317
return encoding
311318

312319

313320
def _decode_body(response, name, raise_if_binary=True):
314321
encoding = _extract_encoding(response.headers, response.body) or 'UTF-8'
315-
text = response.body.decode(encoding, errors='replace')
322+
try:
323+
text = response.body.decode(encoding, errors='replace')
324+
except LookupError:
325+
# If the encoding we found isn't known, fall back to ascii
326+
text = response.body.decode('ascii', errors='replace')
327+
328+
text_length = len(text)
329+
if text_length == 0:
330+
return text
331+
332+
# Replace null terminators; some differs (especially those written in C)
333+
# don't handle them well in the middle of a string.
334+
text = text.replace('\u0000', '\ufffd')
335+
316336
# If a significantly large portion of the document was totally undecodable,
317337
# it's likely this wasn't text at all, but binary data.
318-
if raise_if_binary and text.count('\ufffd') / len(text) > 0.25:
338+
if raise_if_binary and text.count('\ufffd') / text_length > 0.25:
319339
raise UndecodableContentError(f'The response body of `{name}` could not be decoded as {encoding}.')
340+
320341
return text
321342

322343

web_monitoring/tests/fixtures/empty.txt

Whitespace-only changes.
27 Bytes
Binary file not shown.
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
2+
<html>
3+
<head>
4+
<meta http-equiv="refresh" content="60; url=http://radar2pub.bldr.ncep.noaa.gov">
5+
<meta http-equiv="Content-Type" content="text/html; charset=who-knows-what-i-am">
6+
<meta name="Author" content="David S Alden (GMSI)">
7+
<title>NWS Level II Radar Receive Status</title>
8+
<link rel="stylesheet" type="text/css" href="mon.css">
9+
</head>
10+
<body>
11+
<font face="arial,helvetica" size=+2><b>NWS Level II Radar Receive Status as of Tue Jan 17 14:14:06 UTC 2017</b></font><br>
12+
<br>Key: Green=Up (Lvl2&lt;5 min); Yellow=Warning (5&lt;=Lvl2&lt;30 min); Orange=Down (Lvl2&amp;Lvl3&gt;10 min); Red=Down (Lvl2&gt;=30 min)<br>
13+
SiteID: Black=Ok (-1&lt;=Latency&lt;=60 sec); White=Anomaly (-1&gt;Latency&gt;60 sec)<br>
14+
SiteCodes: 01=Legacy Msg1; 02=Legacy Msg31; 03=Super-Res; 04=Recombined; 05=DP w/o SuperRes; 06=DP w Super Res; 07=DP Recombined<br><br>
15+
<p><b><font face="arial,helvetica">Eastern Region Radar Sites - Last receipt of data</font></b>
16+
<table BORDER=0 CELLSPACING=4><tr>
17+
<td ALIGN=CENTER BGCOLOR="#33FF33" class=black id=blacklink TITLE="Wakefield VA" VALIGN=middle><b><A HREF="site/kakq.html" TARGET="_blank">KAKQ</a></b><span class=black>06<br>14:13:02</span></td>
18+
<td ALIGN=CENTER BGCOLOR="#33FF33" class=black id=blacklink TITLE="Binghamton NY" VALIGN=middle><b><A HREF="site/kbgm.html" TARGET="_blank">KBGM</a></b><span class=black>06<br>14:13:03</span></td>
19+
<td ALIGN=CENTER BGCOLOR="#FF0000" class=black id=blacklink TITLE="Boston MA" VALIGN=middle><b><A HREF="site/kbox.html" TARGET="_blank">KBOX</a></b><span class=black>06<br>13:12:47</span></td>
20+
<td ALIGN=CENTER BGCOLOR="#33FF33" class=black id=blacklink TITLE="Buffalo NY" VALIGN=middle><b><A HREF="site/kbuf.html" TARGET="_blank">KBUF</a></b><span class=black>06<br>14:13:04</span></td>
21+
</tr></table>
22+
<font face="Arial,Helvetica">44 sites up (97.8%) of 45 radar sites monitored</font>
23+
<p><font face="arial,helvetica"> 151 sites up (96%) of 158 total radar sites monitored</font>
24+
</body>
25+
</html>

web_monitoring/tests/test_diffing_server_exc_handling.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,10 @@ def test_cors_origin_header(self):
188188
'Origin': 'http://two.com'})
189189
assert response.headers.get('Access-Control-Allow-Origin') == 'http://two.com'
190190

191+
def test_decode_empty_bodies(self):
192+
response = mock_tornado_request('empty.txt')
193+
df._decode_body(response, 'a')
194+
191195
def test_poorly_encoded_content(self):
192196
response = mock_tornado_request('poorly_encoded_utf8.txt')
193197
df._decode_body(response, 'a')
@@ -204,6 +208,16 @@ def test_fetch_undecodable_content(self):
204208
self.json_check(response)
205209
assert response.code == 422
206210

211+
def test_treats_unknown_encoding_as_ascii(self):
212+
response = mock_tornado_request('unknown_encoding.html')
213+
df._decode_body(response, 'a')
214+
215+
def test_diff_content_with_null_bytes(self):
216+
response = self.fetch('/html_source_dmp?format=json&'
217+
f'a=file://{fixture_path("has_null_byte.txt")}&'
218+
f'b=file://{fixture_path("has_null_byte.txt")}')
219+
assert response.code == 200
220+
207221

208222
def mock_diffing_method(c_body):
209223
return

0 commit comments

Comments
 (0)