Skip to content

Commit 82e8693

Browse files
authored
Merge pull request #38 from cyrossignol/http_etag_fix
Optimize etag header parsing
2 parents 58cd372 + de6a107 commit 82e8693

File tree

1 file changed

+47
-47
lines changed

1 file changed

+47
-47
lines changed

src/scraper/http.cpp

Lines changed: 47 additions & 47 deletions
Original file line number | Diff line number | Diff line change
@@ -138,6 +138,49 @@ namespace
138138
};
139139

140140
#endif
141+
142+
//!
//! \brief Parse an etag value from an HTTP header field.
//!
//! This will parse etag values from headers in these standard formats:
//!
//!   ETag: "12345"
//!   ETag: W/"12345"
//!
//! The name of the header is matched in a case-insensitive fashion. The
//! whitespace after the colon that separates the header name from the value
//! is optional, so \c ETag:"12345" is accepted as well. Everything between
//! the colon and the first double quote (such as the weak validator prefix
//! \c W/ ) is skipped, and the surrounding quotes are removed from the output.
//!
//! The function returns an empty string when the header name is not "etag",
//! when the value contains no pair of double quotes, or when the quoted
//! value itself is empty.
//!
//! \param header Entire HTTP header field that includes the name and value.
//!
//! \return The parsed etag value or an empty string if the supplied header
//! contains no standard etag content.
//!
std::string ParseEtag(const std::string& header)
{
    // Length of the header field name "etag"; the colon must directly follow.
    constexpr size_t name_size = 4;

    // The shortest header that can carry a non-empty etag is 8 characters
    // long: the 4-character name, the colon, and a quoted 1-character value
    // without any whitespace after the colon (ETag:"x").
    constexpr size_t min_header_size = 8;

    if (header.size() < min_header_size || header[name_size] != ':') {
        return std::string();
    }

    constexpr char expected[] = "etag";

    // Distance between a lower case ASCII letter and its upper case form.
    constexpr int32_t to_upper = 32;

    // Accept each letter of the name in either lower or upper case.
    for (size_t i = 0; i < name_size; ++i) {
        if (header[i] != expected[i] && header[i] != expected[i] - to_upper) {
            return std::string();
        }
    }

    // Start searching just past the colon.
    const size_t start_quote = header.find('"', name_size + 1);

    // Bail out before looking for the closing quote so that the second search
    // never starts from a wrapped-around "npos + 1" offset.
    if (start_quote == std::string::npos) {
        return std::string();
    }

    const size_t end_quote = header.find('"', start_quote + 1);

    if (end_quote == std::string::npos) {
        return std::string();
    }

    return header.substr(start_quote + 1, end_quote - start_quote - 1);
}
141184
} // anonymous namespace
142185

143186
Http::CurlLifecycle::CurlLifecycle()
@@ -204,63 +247,20 @@ std::string Http::GetEtag(
204247
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &response_code);
205248
EvaluateResponse(response_code, url);
206249

207-
// Find ETag header.
208-
std::string etag;
209-
210250
_log(logattribute::INFO, "Http::ETag", "Header: \n" + header);
211251

212252
std::istringstream iss(header);
213253
for (std::string line; std::getline(iss, line);)
214254
{
215-
std::vector<std::string> header_line_elements = split(line, ":", "\"");
216-
217-
std::vector<std::string> trimmed_stripped_elements;
218-
219-
for (const auto& elem : header_line_elements)
220-
{
221-
std::string output = elem;
255+
std::string etag = ParseEtag(line);
222256

223-
// Get rid of leading and trailing spaces for all fields.
224-
boost::algorithm::trim(output);
225-
226-
// Get rid of quotes.
227-
boost::replace_all(output, "\"", "");
228-
229-
trimmed_stripped_elements.push_back(output);
230-
}
231-
232-
std::string header_name;
233-
234-
if (header_line_elements.size())
235-
{
236-
header_name = trimmed_stripped_elements[0];
237-
238-
// Change everything in header field name to lower case.
239-
boost::to_lower(header_name);
240-
}
241-
242-
if (header_name == "etag" && header_line_elements.size() == 2)
257+
if (!etag.empty())
243258
{
244-
etag = trimmed_stripped_elements[1];
245-
246-
// If the ETag has a "weak" suffix, we don't want the forward slash.
247-
boost::replace_all(etag, "W/", "W");
248-
249-
if(etag.size())
250-
{
251-
_log(logattribute::INFO, "curl_http_header", "Captured ETag for project url <urlfile=" + url + ", ETag=" + etag + ">");
252-
253-
return etag;
254-
}
259+
return etag;
255260
}
256261
}
257262

258-
if (etag.empty())
259-
{
260-
throw std::runtime_error("No ETag response from project url <urlfile=" + url + ">");
261-
}
262-
263-
return std::string();
263+
throw std::runtime_error("No ETag response from project url <urlfile=" + url + ">");
264264
}
265265

266266
std::string Http::GetLatestVersionResponse()

0 commit comments

Comments
 (0)