Skip to content

Commit 9ee38c1

Browse files
style: fix ruff formatting issues in spike investigation files
- Apply ruff formatting to test_gzip_utf8_issue.py
- Apply ruff formatting to fix_gzip_parser_selection.py
- Ensure code style compliance for CI checks

Co-Authored-By: unknown <>
1 parent 4518a4d commit 9ee38c1

File tree

2 files changed

+57
-55
lines changed

2 files changed

+57
-55
lines changed

fix_gzip_parser_selection.py

Lines changed: 39 additions & 39 deletions
Original file line number | Diff line number | Diff line change
@@ -27,13 +27,13 @@
2727
class ImprovedCompositeRawDecoder(CompositeRawDecoder):
2828
"""
2929
Enhanced CompositeRawDecoder with better GZIP detection and error handling.
30-
30+
3131
This addresses the StreamThreadException issue by:
3232
1. Auto-detecting GZIP content based on magic bytes
3333
2. Providing better error handling for decompression failures
3434
3. Falling back gracefully when parser selection fails
3535
"""
36-
36+
3737
def __init__(
3838
self,
3939
parser: Parser,
@@ -43,62 +43,62 @@ def __init__(
4343
) -> None:
4444
super().__init__(parser, stream_response, parsers_by_header)
4545
self._auto_detect_gzip = auto_detect_gzip
46-
46+
4747
def _detect_gzip_content(self, response: requests.Response) -> bool:
4848
"""
4949
Detect if response content is GZIP-compressed by checking magic bytes.
50-
50+
5151
Returns True if the response starts with GZIP magic number (0x1f, 0x8b).
5252
This helps identify GZIP content even when Content-Encoding header is missing.
5353
"""
5454
if not self._auto_detect_gzip:
5555
return False
56-
56+
5757
try:
58-
if hasattr(response, 'raw') and response.raw:
59-
current_pos = response.raw.tell() if hasattr(response.raw, 'tell') else None
60-
58+
if hasattr(response, "raw") and response.raw:
59+
current_pos = response.raw.tell() if hasattr(response.raw, "tell") else None
60+
6161
magic_bytes = response.raw.read(2)
62-
63-
if current_pos is not None and hasattr(response.raw, 'seek'):
62+
63+
if current_pos is not None and hasattr(response.raw, "seek"):
6464
response.raw.seek(current_pos)
65-
elif hasattr(response.raw, 'seek'):
65+
elif hasattr(response.raw, "seek"):
6666
response.raw.seek(0)
67-
68-
return len(magic_bytes) >= 2 and magic_bytes[0] == 0x1f and magic_bytes[1] == 0x8b
69-
70-
elif hasattr(response, 'content') and len(response.content) >= 2:
71-
return response.content[0] == 0x1f and response.content[1] == 0x8b
72-
67+
68+
return len(magic_bytes) >= 2 and magic_bytes[0] == 0x1F and magic_bytes[1] == 0x8B
69+
70+
elif hasattr(response, "content") and len(response.content) >= 2:
71+
return response.content[0] == 0x1F and response.content[1] == 0x8B
72+
7373
except Exception as e:
7474
logger.debug(f"Failed to detect GZIP content: {e}")
75-
75+
7676
return False
77-
77+
7878
def _select_parser(self, response: requests.Response) -> Parser:
7979
"""
8080
Enhanced parser selection with GZIP auto-detection.
81-
81+
8282
This method extends the base implementation to:
8383
1. Check Content-Encoding header (existing behavior)
8484
2. Auto-detect GZIP content by magic bytes
8585
3. Wrap parser with GzipParser if GZIP is detected
8686
"""
8787
selected_parser = super()._select_parser(response)
88-
89-
if (not isinstance(selected_parser, GzipParser) and
90-
self._detect_gzip_content(response)):
91-
92-
logger.info("Auto-detected GZIP content without Content-Encoding header, wrapping parser")
93-
88+
89+
if not isinstance(selected_parser, GzipParser) and self._detect_gzip_content(response):
90+
logger.info(
91+
"Auto-detected GZIP content without Content-Encoding header, wrapping parser"
92+
)
93+
9494
return GzipParser(inner_parser=selected_parser)
95-
95+
9696
return selected_parser
97-
97+
9898
def decode(self, response: requests.Response):
9999
"""
100100
Enhanced decode method with better error handling.
101-
101+
102102
Provides more informative error messages and graceful fallback
103103
when decompression or parsing fails.
104104
"""
@@ -113,14 +113,14 @@ def decode(self, response: requests.Response):
113113
f"Original error: {e}"
114114
)
115115
logger.error(error_msg)
116-
116+
117117
if self._auto_detect_gzip and self._detect_gzip_content(response):
118118
logger.info("Attempting recovery with GZIP decompression")
119119
gzip_parser = GzipParser(inner_parser=self.parser)
120-
121-
if hasattr(response, 'raw') and hasattr(response.raw, 'seek'):
120+
121+
if hasattr(response, "raw") and hasattr(response.raw, "seek"):
122122
response.raw.seek(0)
123-
123+
124124
try:
125125
if self.is_stream_response():
126126
response.raw.auto_close = False
@@ -131,7 +131,7 @@ def decode(self, response: requests.Response):
131131
return
132132
except Exception as recovery_error:
133133
logger.error(f"GZIP recovery failed: {recovery_error}")
134-
134+
135135
raise RuntimeError(error_msg) from e
136136
else:
137137
raise
@@ -143,22 +143,22 @@ def decode(self, response: requests.Response):
143143
def create_bing_ads_compatible_decoder() -> ImprovedCompositeRawDecoder:
144144
"""
145145
Create a CompositeRawDecoder configured for Bing Ads bulk streams.
146-
146+
147147
This decoder handles the campaign_labels stream and other bulk streams
148148
that use GZIP compression with CSV data.
149149
"""
150150
csv_parser = CsvParser(encoding="utf-8-sig", set_values_to_none=[""])
151-
151+
152152
gzip_parser = GzipParser(inner_parser=csv_parser)
153-
153+
154154
decoder = ImprovedCompositeRawDecoder.by_headers(
155155
parsers=[({"Content-Encoding"}, {"gzip"}, gzip_parser)],
156156
stream_response=True,
157157
fallback_parser=csv_parser,
158158
)
159-
159+
160160
decoder._auto_detect_gzip = True
161-
161+
162162
return decoder
163163

164164

test_gzip_utf8_issue.py

Lines changed: 18 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -22,40 +22,42 @@
2222
def test_gzip_utf8_decoding_issue():
2323
"""
2424
Reproduce the issue where GZIP data is incorrectly treated as UTF-8.
25-
25+
2626
This simulates the scenario in Bing Ads campaign_labels stream where:
2727
1. Response contains GZIP-compressed CSV data
2828
2. Parser selection fails to detect GZIP content-encoding
2929
3. Compressed data is passed to UTF-8 decoder
3030
4. UTF-8 decoder fails with byte 0x8b error
3131
"""
3232
csv_data = "Account Id,Campaign,Client Id\n123,Test Campaign,456\n"
33-
34-
compressed_data = gzip.compress(csv_data.encode('utf-8'))
35-
36-
assert compressed_data[1] == 0x8b, f"Expected GZIP magic number 0x8b, got {hex(compressed_data[1])}"
37-
33+
34+
compressed_data = gzip.compress(csv_data.encode("utf-8"))
35+
36+
assert (
37+
compressed_data[1] == 0x8B
38+
), f"Expected GZIP magic number 0x8b, got {hex(compressed_data[1])}"
39+
3840
mock_response = Mock(spec=requests.Response)
3941
mock_response.content = compressed_data
4042
mock_response.raw = io.BytesIO(compressed_data)
4143
mock_response.headers = {} # Missing Content-Encoding: gzip header
42-
44+
4345
csv_parser = CsvParser(encoding="utf-8")
4446
decoder = CompositeRawDecoder(parser=csv_parser, stream_response=False)
45-
47+
4648
try:
4749
list(decoder.decode(mock_response))
4850
assert False, "Expected UTF-8 decoding error but none occurred"
4951
except UnicodeDecodeError as e:
5052
assert "can't decode byte 0x8b" in str(e)
5153
assert "invalid start byte" in str(e)
5254
print(f"✓ Reproduced the issue: {e}")
53-
55+
5456
gzip_parser = GzipParser(inner_parser=csv_parser)
5557
correct_decoder = CompositeRawDecoder(parser=gzip_parser, stream_response=False)
56-
58+
5759
mock_response.raw = io.BytesIO(compressed_data)
58-
60+
5961
records = list(correct_decoder.decode(mock_response))
6062
assert len(records) == 1
6163
assert records[0]["Account Id"] == "123"
@@ -69,22 +71,22 @@ def test_header_based_parser_selection():
6971
when Content-Encoding header is present.
7072
"""
7173
csv_data = "Account Id,Campaign\n123,Test\n"
72-
compressed_data = gzip.compress(csv_data.encode('utf-8'))
73-
74+
compressed_data = gzip.compress(csv_data.encode("utf-8"))
75+
7476
mock_response = Mock(spec=requests.Response)
7577
mock_response.content = compressed_data
7678
mock_response.raw = io.BytesIO(compressed_data)
7779
mock_response.headers = {"Content-Encoding": "gzip"}
78-
80+
7981
gzip_parser = GzipParser(inner_parser=CsvParser(encoding="utf-8"))
8082
fallback_parser = CsvParser(encoding="utf-8")
81-
83+
8284
decoder = CompositeRawDecoder.by_headers(
8385
parsers=[({"Content-Encoding"}, {"gzip"}, gzip_parser)],
8486
stream_response=False,
8587
fallback_parser=fallback_parser,
8688
)
87-
89+
8890
records = list(decoder.decode(mock_response))
8991
assert len(records) == 1
9092
assert records[0]["Account Id"] == "123"

0 commit comments

Comments
 (0)