Skip to content

Commit 1daffbe

Browse files
Nemo157 authored and Joshua Nelson committed
Refactor github updater to handle rate limits better
1 parent 4c7cdb6 commit 1daffbe

File tree

8 files changed: 179 additions and 120 deletions

8 files changed

+179
-120
lines changed

.env.sample

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,3 @@
1-
CRATESFYI_GITHUB_USERNAME=
2-
CRATESFYI_GITHUB_ACCESSTOKEN=
31
CRATESFYI_PREFIX=ignored/cratesfyi-prefix
42
CRATESFYI_DATABASE_URL=postgresql://cratesfyi:password@localhost
53
RUST_LOG=cratesfyi,rustwide=info

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -44,6 +44,7 @@ zstd = "0.5"
4444
git2 = { version = "0.13.6", default-features = false }
4545
path-slash = "0.1.3"
4646
once_cell = { version = "1.4.0", features = ["parking_lot"] }
47+
base64 = "0.12.1"
4748

4849
# Data serialization and deserialization
4950
serde = { version = "1.0", features = ["derive"] }

src/bin/cratesfyi.rs

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -441,8 +441,8 @@ impl DatabaseSubcommand {
441441
}
442442

443443
Self::UpdateGithubFields => {
444-
cratesfyi::utils::github_updater(&*ctx.conn()?)
445-
.expect("Failed to update github fields");
444+
cratesfyi::utils::GithubUpdater::new(&*ctx.config()?, ctx.pool()?)?
445+
.update_all_crates()?;
446446
}
447447

448448
Self::AddDirectory { directory, prefix } => {

src/config.rs

Lines changed: 18 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,10 @@ pub struct Config {
1212
pub(crate) max_pool_size: u32,
1313
pub(crate) min_pool_idle: u32,
1414

15+
// Github authentication
16+
pub(crate) github_username: Option<String>,
17+
pub(crate) github_accesstoken: Option<String>,
18+
1519
// Max size of the files served by the docs.rs frontend
1620
pub(crate) max_file_size: usize,
1721
pub(crate) max_file_size_html: usize,
@@ -26,10 +30,20 @@ impl Config {
2630
max_pool_size: env("DOCSRS_MAX_POOL_SIZE", 90)?,
2731
min_pool_idle: env("DOCSRS_MIN_POOL_IDLE", 10)?,
2832

33+
github_username: maybe_env("CRATESFYI_GITHUB_USERNAME")?,
34+
github_accesstoken: maybe_env("CRATESFYI_GITHUB_ACCESSTOKEN")?,
35+
2936
max_file_size: env("DOCSRS_MAX_FILE_SIZE", 50 * 1024 * 1024)?,
3037
max_file_size_html: env("DOCSRS_MAX_FILE_SIZE_HTML", 5 * 1024 * 1024)?,
3138
})
3239
}
40+
41+
pub fn github_auth(&self) -> Option<(&str, &str)> {
42+
Some((
43+
self.github_username.as_deref()?,
44+
self.github_accesstoken.as_deref()?,
45+
))
46+
}
3347
}
3448

3549
fn env<T>(var: &str, default: T) -> Result<T, Error>
@@ -58,7 +72,10 @@ where
5872
.parse::<T>()
5973
.map(Some)
6074
.with_context(|_| format!("failed to parse configuration variable {}", var))?),
61-
Err(VarError::NotPresent) => Ok(None),
75+
Err(VarError::NotPresent) => {
76+
log::debug!("optional configuration variable {} is not set", var);
77+
Ok(None)
78+
}
6279
Err(VarError::NotUnicode(_)) => bail!("configuration variable {} is not UTF-8", var),
6380
}
6481
}

src/utils/daemon.rs

Lines changed: 6 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44
55
use crate::{
66
db::Pool,
7-
utils::{github_updater, queue_builder, update_release_activity},
7+
utils::{queue_builder, update_release_activity, GithubUpdater},
88
BuildQueue, Config, DocBuilder, DocBuilderOptions,
99
};
1010
use chrono::{Timelike, Utc};
@@ -21,11 +21,7 @@ pub fn start_daemon(
2121
build_queue: Arc<BuildQueue>,
2222
enable_registry_watcher: bool,
2323
) -> Result<(), Error> {
24-
const CRATE_VARIABLES: [&str; 3] = [
25-
"CRATESFYI_PREFIX",
26-
"CRATESFYI_GITHUB_USERNAME",
27-
"CRATESFYI_GITHUB_ACCESSTOKEN",
28-
];
24+
const CRATE_VARIABLES: &[&str] = &["CRATESFYI_PREFIX"];
2925

3026
// first check required environment variables
3127
for v in CRATE_VARIABLES.iter() {
@@ -96,13 +92,13 @@ pub fn start_daemon(
9692
},
9793
)?;
9894

99-
// update github stats every 6 hours
100-
let cloned_db = db.clone();
95+
// update github stats every hour
96+
let github_updater = GithubUpdater::new(&config, db.clone())?;
10197
cron(
10298
"github stats updater",
103-
Duration::from_secs(60 * 60 * 6),
99+
Duration::from_secs(60 * 60),
104100
move || {
105-
github_updater(&*cloned_db.get()?)?;
101+
github_updater.update_all_crates()?;
106102
Ok(())
107103
},
108104
)?;

src/utils/github_updater.rs

Lines changed: 150 additions & 104 deletions
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,18 @@
11
use crate::error::Result;
2+
use crate::{db::Pool, Config};
23
use chrono::{DateTime, Utc};
34
use failure::err_msg;
4-
use log::debug;
5+
use log::{debug, warn};
56
use postgres::Connection;
67
use regex::Regex;
7-
use std::str::FromStr;
8+
use reqwest::header::{HeaderValue, ACCEPT, AUTHORIZATION, USER_AGENT};
9+
use serde::Deserialize;
10+
11+
const APP_USER_AGENT: &str = concat!(
12+
env!("CARGO_PKG_NAME"),
13+
" ",
14+
include_str!(concat!(env!("OUT_DIR"), "/git_version"))
15+
);
816

917
/// Fields we need use in cratesfyi
1018
#[derive(Debug)]
@@ -16,115 +24,153 @@ struct GitHubFields {
1624
last_commit: DateTime<Utc>,
1725
}
1826

19-
/// Updates github fields in crates table
20-
pub fn github_updater(conn: &Connection) -> Result<()> {
21-
// TODO: This query assumes repository field in Cargo.toml is
22-
// always the same across all versions of a crate
23-
for row in &conn.query(
24-
"SELECT DISTINCT ON (crates.name)
25-
crates.name,
26-
crates.id,
27-
releases.repository_url
28-
FROM crates
29-
INNER JOIN releases ON releases.crate_id = crates.id
30-
WHERE releases.repository_url ~ '^https?://github.com' AND
31-
(crates.github_last_update < NOW() - INTERVAL '1 day' OR
32-
crates.github_last_update IS NULL)
33-
ORDER BY crates.name, releases.release_time DESC",
34-
&[],
35-
)? {
36-
let crate_name: String = row.get(0);
37-
let crate_id: i32 = row.get(1);
38-
let repository_url: String = row.get(2);
39-
40-
if let Err(err) = get_github_path(&repository_url[..])
41-
.ok_or_else(|| err_msg("Failed to get github path"))
42-
.and_then(|path| get_github_fields(&path[..]))
43-
.and_then(|fields| {
44-
conn.execute(
45-
"UPDATE crates
46-
SET github_description = $1,
47-
github_stars = $2, github_forks = $3,
48-
github_issues = $4, github_last_commit = $5,
49-
github_last_update = NOW()
50-
WHERE id = $6",
51-
&[
52-
&fields.description,
53-
&(fields.stars as i32),
54-
&(fields.forks as i32),
55-
&(fields.issues as i32),
56-
&fields.last_commit.naive_utc(),
57-
&crate_id,
58-
],
59-
)
60-
.or_else(|e| Err(e.into()))
61-
})
62-
{
63-
debug!("Failed to update github fields of: {} {}", crate_name, err);
27+
pub struct GithubUpdater {
28+
client: reqwest::blocking::Client,
29+
pool: Pool,
30+
}
31+
32+
impl GithubUpdater {
33+
pub fn new(config: &Config, pool: Pool) -> Result<Self> {
34+
let mut headers = vec![
35+
(USER_AGENT, HeaderValue::from_static(APP_USER_AGENT)),
36+
(ACCEPT, HeaderValue::from_static("application/json")),
37+
];
38+
39+
if let Some((username, accesstoken)) = config.github_auth() {
40+
let basicauth = format!(
41+
"Basic {}",
42+
base64::encode(format!("{}:{}", username, accesstoken))
43+
);
44+
headers.push((AUTHORIZATION, HeaderValue::from_str(&basicauth).unwrap()));
45+
} else {
46+
warn!("No GitHub authorization specified, will be working with very low rate limits");
6447
}
6548

66-
// sleep for rate limits
67-
use std::thread;
68-
use std::time::Duration;
69-
thread::sleep(Duration::from_secs(2));
49+
let client = reqwest::blocking::Client::builder()
50+
.default_headers(headers.into_iter().collect())
51+
.build()?;
52+
53+
Ok(GithubUpdater { client, pool })
7054
}
7155

72-
Ok(())
73-
}
56+
/// Updates github fields in crates table
57+
pub fn update_all_crates(&self) -> Result<()> {
58+
debug!("Starting update of all crates");
59+
60+
if self.is_rate_limited()? {
61+
warn!("Skipping update because of rate limit");
62+
return Ok(());
63+
}
64+
65+
let conn = self.pool.get()?;
66+
// TODO: This query assumes repository field in Cargo.toml is
67+
// always the same across all versions of a crate
68+
let rows = conn.query(
69+
"SELECT DISTINCT ON (crates.name)
70+
crates.name,
71+
crates.id,
72+
releases.repository_url
73+
FROM crates
74+
INNER JOIN releases ON releases.crate_id = crates.id
75+
WHERE releases.repository_url ~ '^https?://github.com' AND
76+
(crates.github_last_update < NOW() - INTERVAL '1 day' OR
77+
crates.github_last_update IS NULL)
78+
ORDER BY crates.name, releases.release_time DESC",
79+
&[],
80+
)?;
81+
82+
for row in &rows {
83+
let crate_name: String = row.get(0);
84+
let crate_id: i32 = row.get(1);
85+
let repository_url: String = row.get(2);
86+
87+
debug!("Updating {}", crate_name);
88+
if let Err(err) = self.update_crate(&conn, crate_id, &repository_url) {
89+
if self.is_rate_limited()? {
90+
warn!("Skipping remaining updates because of rate limit");
91+
return Ok(());
92+
}
93+
warn!("Failed to update {}: {}", crate_name, err);
94+
}
95+
}
96+
97+
debug!("Completed all updates");
98+
Ok(())
99+
}
100+
101+
fn is_rate_limited(&self) -> Result<bool> {
102+
#[derive(Deserialize)]
103+
struct Response {
104+
resources: Resources,
105+
}
106+
107+
#[derive(Deserialize)]
108+
struct Resources {
109+
core: Resource,
110+
}
111+
112+
#[derive(Deserialize)]
113+
struct Resource {
114+
remaining: u64,
115+
}
116+
117+
let url = "https://api.github.com/rate_limit";
118+
let response: Response = self.client.get(url).send()?.error_for_status()?.json()?;
74119

75-
fn get_github_fields(path: &str) -> Result<GitHubFields> {
76-
use serde_json::Value;
77-
78-
let body = {
79-
use reqwest::{blocking::Client, header::USER_AGENT, StatusCode};
80-
use std::{env, io::Read};
81-
82-
let client = Client::new();
83-
let mut body = String::new();
84-
85-
let mut resp = client
86-
.get(&format!("https://api.github.com/repos/{}", path)[..])
87-
.header(
88-
USER_AGENT,
89-
format!("cratesfyi/{}", env!("CARGO_PKG_VERSION")),
90-
)
91-
.basic_auth(
92-
env::var("CRATESFYI_GITHUB_USERNAME")
93-
.ok()
94-
.unwrap_or_default(),
95-
env::var("CRATESFYI_GITHUB_ACCESSTOKEN").ok(),
96-
)
97-
.send()?;
98-
99-
if resp.status() != StatusCode::OK {
100-
return Err(err_msg("Failed to get github data"));
120+
Ok(response.resources.core.remaining == 0)
121+
}
122+
123+
fn update_crate(&self, conn: &Connection, crate_id: i32, repository_url: &str) -> Result<()> {
124+
let path =
125+
get_github_path(repository_url).ok_or_else(|| err_msg("Failed to get github path"))?;
126+
let fields = self.get_github_fields(&path)?;
127+
128+
conn.execute(
129+
"UPDATE crates
130+
SET github_description = $1,
131+
github_stars = $2, github_forks = $3,
132+
github_issues = $4, github_last_commit = $5,
133+
github_last_update = NOW()
134+
WHERE id = $6",
135+
&[
136+
&fields.description,
137+
&(fields.stars as i32),
138+
&(fields.forks as i32),
139+
&(fields.issues as i32),
140+
&fields.last_commit.naive_utc(),
141+
&crate_id,
142+
],
143+
)?;
144+
145+
Ok(())
146+
}
147+
148+
fn get_github_fields(&self, path: &str) -> Result<GitHubFields> {
149+
#[derive(Deserialize)]
150+
struct Response {
151+
#[serde(default)]
152+
description: Option<String>,
153+
#[serde(default)]
154+
stargazers_count: i64,
155+
#[serde(default)]
156+
forks_count: i64,
157+
#[serde(default)]
158+
open_issues: i64,
159+
#[serde(default = "Utc::now")]
160+
pushed_at: DateTime<Utc>,
101161
}
102162

103-
resp.read_to_string(&mut body)?;
104-
body
105-
};
106-
107-
let json = Value::from_str(&body[..])?;
108-
let obj = json.as_object().unwrap();
109-
110-
Ok(GitHubFields {
111-
description: obj
112-
.get("description")
113-
.and_then(|d| d.as_str())
114-
.unwrap_or("")
115-
.to_string(),
116-
stars: obj
117-
.get("stargazers_count")
118-
.and_then(|d| d.as_i64())
119-
.unwrap_or(0),
120-
forks: obj.get("forks_count").and_then(|d| d.as_i64()).unwrap_or(0),
121-
issues: obj.get("open_issues").and_then(|d| d.as_i64()).unwrap_or(0),
122-
last_commit: DateTime::parse_from_rfc3339(
123-
obj.get("pushed_at").and_then(|d| d.as_str()).unwrap_or(""),
124-
)
125-
.map(|datetime| datetime.with_timezone(&Utc))
126-
.unwrap_or_else(|_| Utc::now()),
127-
})
163+
let url = format!("https://api.github.com/repos/{}", path);
164+
let response: Response = self.client.get(&url).send()?.error_for_status()?.json()?;
165+
166+
Ok(GitHubFields {
167+
description: response.description.unwrap_or_default(),
168+
stars: response.stargazers_count,
169+
forks: response.forks_count,
170+
issues: response.open_issues,
171+
last_commit: response.pushed_at,
172+
})
173+
}
128174
}
129175

130176
fn get_github_path(url: &str) -> Option<String> {

0 commit comments

Comments
 (0)