From 53b1092ea1755168fd753e4fed5c046622783d26 Mon Sep 17 00:00:00 2001 From: Denis Cornehl Date: Sat, 26 Dec 2020 11:30:24 +0100 Subject: [PATCH 1/8] Split single sitemap into index and sub-sitemaps per starting character --- src/web/routes.rs | 6 ++++- src/web/sitemap.rs | 46 ++++++++++++++++++++++++++++----- templates/core/sitemapindex.xml | 8 ++++++ 3 files changed, 53 insertions(+), 7 deletions(-) create mode 100644 templates/core/sitemapindex.xml diff --git a/src/web/routes.rs b/src/web/routes.rs index 44b808471..2e813baff 100644 --- a/src/web/routes.rs +++ b/src/web/routes.rs @@ -16,7 +16,11 @@ pub(super) fn build_routes() -> Routes { // https://support.google.com/webmasters/answer/183668?hl=en routes.static_resource("/robots.txt", PermanentRedirect("/-/static/robots.txt")); routes.static_resource("/favicon.ico", PermanentRedirect("/-/static/favicon.ico")); - routes.static_resource("/sitemap.xml", super::sitemap::sitemap_handler); + routes.static_resource("/sitemap.xml", super::sitemap::sitemapindex_handler); + routes.static_resource( + "/-/sitemap/:which/sitemap.xml", + super::sitemap::sitemap_handler, + ); // This should not need to be served from the root as we reference the inner path in links, // but clients might have cached the url and need to update it. diff --git a/src/web/sitemap.rs b/src/web/sitemap.rs index 8c6ffa8c9..525485d10 100644 --- a/src/web/sitemap.rs +++ b/src/web/sitemap.rs @@ -5,9 +5,27 @@ use iron::{ mime::{Mime, SubLevel, TopLevel}, IronResult, Request, Response, }; +use router::Router; use serde::Serialize; use serde_json::Value; +/// sitemap index +#[derive(Debug, Clone, PartialEq, Eq, Serialize)] +struct SitemapIndexXml { + sitemaps: Vec, +} + +impl_webpage! { + SitemapIndexXml = "core/sitemapindex.xml", + content_type = ContentType(Mime(TopLevel::Application, SubLevel::Xml, vec![])), +} + +pub fn sitemapindex_handler(req: &mut Request) -> IronResult { + let sitemaps: Vec = (b'a'..=b'z').map(char::from).collect(); + + SitemapIndexXml { sitemaps }.into_response(req) +} + /// The sitemap #[derive(Debug, Clone, PartialEq, Eq, Serialize)] struct SitemapXml { @@ -21,16 +39,30 @@ impl_webpage! { } pub fn sitemap_handler(req: &mut Request) -> IronResult { + let router = extension!(req, Router); + let which = cexpect!(req, router.find("which")).to_lowercase(); + let mut conn = extension!(req, Pool).get()?; let query = conn .query( - "SELECT DISTINCT ON (crates.name) - crates.name, - releases.release_time + "SELECT crates.name, + MAX(releases.release_time) as release_time FROM crates INNER JOIN releases ON releases.crate_id = crates.id - WHERE rustdoc_status = true", - &[], + WHERE + rustdoc_status = true AND + ( + crates.name like $1 OR + crates.name like $2 + ) + GROUP BY crates.name + ", + &[ + // this LIKE pattern has the '%' only at the end, + // so postgres can use the index on `name` + &format!("{}%", which), + &format!("{}%", which.to_uppercase()), + ], ) .unwrap(); @@ -125,13 +157,15 @@ mod tests { wrapper(|env| { let web = env.frontend(); assert_success("/sitemap.xml", web)?; + assert_success("/-/sitemap/s/sitemap.xml", web)?; env.fake_release().name("some_random_crate").create()?; env.fake_release() .name("some_random_crate_that_failed") .build_result_successful(false) .create()?; - assert_success("/sitemap.xml", web) + assert_success("/sitemap.xml", web)?; + assert_success("/-/sitemap/s/sitemap.xml", web) }) } diff --git a/templates/core/sitemapindex.xml b/templates/core/sitemapindex.xml new file mode 100644 index 000000000..6c5c88184 --- /dev/null +++ b/templates/core/sitemapindex.xml @@ -0,0 +1,8 @@ + + + {% for which in sitemaps -%} + + https://docs.rs/-/sitemap/{{ which }}/sitemap.xml + + {%- endfor %} + From 8ee50fd41e984f082d0e42549f37c31893402d73 Mon Sep 17 00:00:00 2001 From: Denis Cornehl Date: Sun, 27 Dec 2020 07:40:20 +0100 Subject: [PATCH 2/8] rename route parameter name --- src/web/routes.rs | 2 +- src/web/sitemap.rs | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/web/routes.rs b/src/web/routes.rs index 2e813baff..dbe888bd1 100644 --- a/src/web/routes.rs +++ b/src/web/routes.rs @@ -18,7 +18,7 @@ pub(super) fn build_routes() -> Routes { routes.static_resource("/favicon.ico", PermanentRedirect("/-/static/favicon.ico")); routes.static_resource("/sitemap.xml", super::sitemap::sitemapindex_handler); routes.static_resource( - "/-/sitemap/:which/sitemap.xml", + "/-/sitemap/:letter/sitemap.xml", super::sitemap::sitemap_handler, ); diff --git a/src/web/sitemap.rs b/src/web/sitemap.rs index 525485d10..079f688a7 100644 --- a/src/web/sitemap.rs +++ b/src/web/sitemap.rs @@ -40,7 +40,7 @@ impl_webpage! { pub fn sitemap_handler(req: &mut Request) -> IronResult { let router = extension!(req, Router); - let which = cexpect!(req, router.find("which")).to_lowercase(); + let letter = cexpect!(req, router.find("letter")).to_lowercase(); let mut conn = extension!(req, Pool).get()?; let query = conn @@ -60,8 +60,8 @@ pub fn sitemap_handler(req: &mut Request) -> IronResult { &[ // this LIKE pattern has the '%' only at the end, // so postgres can use the index on `name` - &format!("{}%", which), - &format!("{}%", which.to_uppercase()), + &format!("{}%", letter), + &format!("{}%", letter.to_uppercase()), ], ) .unwrap(); From 906720ab898b65531d29b8bca6855d4ca0d82b89 Mon Sep 17 00:00:00 2001 From: Denis Cornehl Date: Sun, 27 Dec 2020 07:50:23 +0100 Subject: [PATCH 3/8] add comment about psql LIKE index usage --- src/web/sitemap.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/web/sitemap.rs b/src/web/sitemap.rs index 079f688a7..981c36c32 100644 --- a/src/web/sitemap.rs +++ b/src/web/sitemap.rs @@ -58,8 +58,11 @@ pub fn sitemap_handler(req: &mut Request) -> IronResult { GROUP BY crates.name ", &[ - // this LIKE pattern has the '%' only at the end, - // so postgres can use the index on `name` + // postgres can use the normal BTREE index on `name` + // for LIKE queries, if they are anchored to the + // beginning of the string. + // This does not work for ILIKE and alphabetic + // characters, hence the OR. &format!("{}%", letter), &format!("{}%", letter.to_uppercase()), ], From e293a5001d5f0864be222c56fe8ce49d2b4bae40 Mon Sep 17 00:00:00 2001 From: Denis Cornehl Date: Sun, 27 Dec 2020 09:11:47 +0100 Subject: [PATCH 4/8] add more tests, fail more often --- src/web/sitemap.rs | 73 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 66 insertions(+), 7 deletions(-) diff --git a/src/web/sitemap.rs b/src/web/sitemap.rs index 981c36c32..d7a89b4e5 100644 --- a/src/web/sitemap.rs +++ b/src/web/sitemap.rs @@ -1,4 +1,4 @@ -use crate::{db::Pool, docbuilder::Limits, impl_webpage, web::page::WebPage}; +use crate::{db::Pool, docbuilder::Limits, impl_webpage, web::error::Nope, web::page::WebPage}; use chrono::{DateTime, Utc}; use iron::{ headers::ContentType, @@ -40,7 +40,15 @@ impl_webpage! { pub fn sitemap_handler(req: &mut Request) -> IronResult { let router = extension!(req, Router); - let letter = cexpect!(req, router.find("letter")).to_lowercase(); + let letter = cexpect!(req, router.find("letter")); + + if letter.len() != 1 { + return Err(Nope::ResourceNotFound.into()); + } else if let Some(ch) = letter.chars().next() { + if !(ch.is_ascii_lowercase()) { + return Err(Nope::ResourceNotFound.into()); + } + } let mut conn = extension!(req, Pool).get()?; let query = conn @@ -154,21 +162,72 @@ pub fn about_handler(req: &mut Request) -> IronResult { #[cfg(test)] mod tests { use crate::test::{assert_success, wrapper}; + use reqwest::StatusCode; + + #[test] + fn sitemap_index() { + wrapper(|env| { + let web = env.frontend(); + assert_success("/sitemap.xml", web) + }) + } + + #[test] + fn sitemap_invalid_letters() { + wrapper(|env| { + let web = env.frontend(); + + // everything not length=1 and ascii-lowercase should fail + for invalid_letter in &["1", "aa", "A", ""] { + println!("trying to fail letter {}", invalid_letter); + assert_eq!( + web.get(&format!("/-/sitemap/{}/sitemap.xml", invalid_letter)) + .send()? + .status(), + StatusCode::NOT_FOUND + ); + } + Ok(()) + }) + } #[test] - fn sitemap() { + fn sitemap_letter() { wrapper(|env| { let web = env.frontend(); - assert_success("/sitemap.xml", web)?; - assert_success("/-/sitemap/s/sitemap.xml", web)?; + + let letters: Vec = (b'a'..=b'z').map(char::from).collect(); + + // letter-sitemaps always work, even without crates & releases + for letter in letters.iter().as_ref() { + assert_success(&format!("/-/sitemap/{}/sitemap.xml", letter), web)?; + } env.fake_release().name("some_random_crate").create()?; env.fake_release() .name("some_random_crate_that_failed") .build_result_successful(false) .create()?; - assert_success("/sitemap.xml", web)?; - assert_success("/-/sitemap/s/sitemap.xml", web) + + // these fake crates appear only in the `s` sitemap + let response = web.get("/-/sitemap/s/sitemap.xml").send()?; + assert!(response.status().is_success()); + + let content = response.text()?; + assert!(content.contains(&"some_random_crate")); + assert!(!(content.contains(&"some_random_crate_that_failed"))); + + // and not in the others + for letter in letters.iter().filter(|&&c| c != 's') { + let response = web + .get(&format!("/-/sitemap/{}/sitemap.xml", letter)) + .send()?; + + assert!(response.status().is_success()); + assert!(!(response.text()?.contains("some_random_crate"))); + } + + Ok(()) }) } From ccc59c29cdecd9d8dc8b859e3497005ffc1fbc1f Mon Sep 17 00:00:00 2001 From: Denis Cornehl Date: Sun, 27 Dec 2020 16:40:36 +0100 Subject: [PATCH 5/8] Update src/web/sitemap.rs Co-authored-by: Joshua Nelson --- src/web/sitemap.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/web/sitemap.rs b/src/web/sitemap.rs index d7a89b4e5..72cf70d09 100644 --- a/src/web/sitemap.rs +++ b/src/web/sitemap.rs @@ -218,7 +218,7 @@ mod tests { assert!(!(content.contains(&"some_random_crate_that_failed"))); // and not in the others - for letter in letters.iter().filter(|&&c| c != 's') { + for letter in ('a'..='z').filter(|&&c| c != 's') { let response = web .get(&format!("/-/sitemap/{}/sitemap.xml", letter)) .send()?; From 0126fc8ef1ba558dbb8fb6c42d0d512a4c242df6 Mon Sep 17 00:00:00 2001 From: Denis Cornehl Date: Sun, 27 Dec 2020 16:40:43 +0100 Subject: [PATCH 6/8] Update src/web/sitemap.rs Co-authored-by: Joshua Nelson --- src/web/sitemap.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/web/sitemap.rs b/src/web/sitemap.rs index 72cf70d09..8f41cf8e5 100644 --- a/src/web/sitemap.rs +++ b/src/web/sitemap.rs @@ -196,10 +196,8 @@ mod tests { wrapper(|env| { let web = env.frontend(); - let letters: Vec = (b'a'..=b'z').map(char::from).collect(); - // letter-sitemaps always work, even without crates & releases - for letter in letters.iter().as_ref() { + for letter in 'a'..='z' { assert_success(&format!("/-/sitemap/{}/sitemap.xml", letter), web)?; } From 40c10bf1f6ec7cac774af693f41e12183c855c9a Mon Sep 17 00:00:00 2001 From: Denis Cornehl Date: Sun, 27 Dec 2020 16:44:37 +0100 Subject: [PATCH 7/8] use char range where possible --- src/web/sitemap.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/web/sitemap.rs b/src/web/sitemap.rs index 8f41cf8e5..d02c991aa 100644 --- a/src/web/sitemap.rs +++ b/src/web/sitemap.rs @@ -21,7 +21,7 @@ impl_webpage! { } pub fn sitemapindex_handler(req: &mut Request) -> IronResult { - let sitemaps: Vec = (b'a'..=b'z').map(char::from).collect(); + let sitemaps: Vec = ('a'..='z').collect(); SitemapIndexXml { sitemaps }.into_response(req) } @@ -216,7 +216,7 @@ mod tests { assert!(!(content.contains(&"some_random_crate_that_failed"))); // and not in the others - for letter in ('a'..='z').filter(|&&c| c != 's') { + for letter in ('a'..='z').filter(|&c| c != 's') { let response = web .get(&format!("/-/sitemap/{}/sitemap.xml", letter)) .send()?; From 62dcdef394b19b9c38e08b0b2698960ff133742d Mon Sep 17 00:00:00 2001 From: Denis Cornehl Date: Sun, 27 Dec 2020 16:56:31 +0100 Subject: [PATCH 8/8] switch from LIKE to ILIKE for better readability --- src/web/sitemap.rs | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/src/web/sitemap.rs b/src/web/sitemap.rs index d02c991aa..c08b167e0 100644 --- a/src/web/sitemap.rs +++ b/src/web/sitemap.rs @@ -59,21 +59,10 @@ pub fn sitemap_handler(req: &mut Request) -> IronResult { INNER JOIN releases ON releases.crate_id = crates.id WHERE rustdoc_status = true AND - ( - crates.name like $1 OR - crates.name like $2 - ) + crates.name ILIKE $1 GROUP BY crates.name ", - &[ - // postgres can use the normal BTREE index on `name` - // for LIKE queries, if they are anchored to the - // beginning of the string. - // This does not work for ILIKE and alphabetic - // characters, hence the OR. - &format!("{}%", letter), - &format!("{}%", letter.to_uppercase()), - ], + &[&format!("{}%", letter)], ) .unwrap();