From 4c4381169e82f15a3244d70797092c09bcc225b2 Mon Sep 17 00:00:00 2001 From: "Carol (Nichols || Goulding)" Date: Wed, 26 Apr 2017 18:33:14 -0400 Subject: [PATCH 1/3] Add lots of documentation --- docs/ARCHITECTURE.md | 104 ++++++++++++++++++++++++++++++++++++++ docs/BACKEND.md | 118 +++++++++++++++++++++++++++++++++++++++++++ docs/FRONTEND.md | 7 +++ src/app.rs | 24 ++++++++- src/bin/server.rs | 41 +++++++++++++-- src/lib.rs | 55 ++++++++++++++++++-- 6 files changed, 338 insertions(+), 11 deletions(-) create mode 100644 docs/ARCHITECTURE.md create mode 100644 docs/BACKEND.md create mode 100644 docs/FRONTEND.md diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md new file mode 100644 index 00000000000..7ebe4820d2a --- /dev/null +++ b/docs/ARCHITECTURE.md @@ -0,0 +1,104 @@ +# Architecture of Crates.io + +This document is an intro to the codebase in this repo. If you want to work on a bug or a feature, +hopefully after reading this doc, you'll have a good idea of where to start looking for the code +you want to change. + +This is a work in progress. Pull requests and issues to improve this document are very welcome! + +## Documentation + +Documentation about the codebase appears in these locations: + +* `LICENSE-APACHE` and `LICENSE-MIT` - the terms under which this codebase is licensed. +* `README.md` - Important information we want to show on the github front page. +* `docs/` - Long-form documentation. + +## Backend - Rust + +The backend of crates.io is written in Rust. Most of that code lives in the *src* directory. It +serves a JSON API over HTTP, and the HTTP server interface is provided by the [conduit][] crate and +related crates. More information about the backend is in +[`docs/BACKEND.md`](https://github.com/rust-lang/crates.io/blob/master/docs/BACKEND.md). 
+ +[conduit]: https://crates.io/crates/conduit + +These files have to do with the backend: + +* `build.rs` - Cargo build script +* `Cargo.lock` - Locks dependencies to specific versions providing consistency across development + and deployment +* `Cargo.toml` - Defines the crate and its dependencies +* `migrations/` - Diesel migrations applied to the database during development and deployment +* `.rustfmt.toml` - Defines Rust coding style guidelines which are enforced by the CI environment +* `src/` - The backend's source code +* `target/` - Compiled output, including dependencies and final binary artifacts - (ignored in + `.gitignore`) +* `tmp/index-co` - The registry repository; in production this is cloned from Github and in + development from `tmp/index-bare` - (ignored in `.gitignore`) + +The backend stores information in a Postgres database. + +## Frontend - Ember.js + +The frontend of crates.io is written in JavaScript using [Ember.js][]. More information about the +frontend is in [`docs/FRONTEND.md`](https://github.com/rust-lang/crates.io/blob/master/docs/FRONTEND.md). + +[Ember.js]: https://emberjs.com/ + +These files have to do with the frontend: + +* `app/` - The frontend's source code +* `config/{environment,targets}.js` - Configuration of the frontend +* `dist/` - Contains the distributable (optimized and self-contained) output of building the + frontend; served under the root `/` url - (ignored in `.gitignore`) +* `.ember-cli` - Settings for the `ember` command line interface +* `ember-cli-build.js` - Contains the build specification for Broccoli +* `.eslintrc.js` - Defines Javascript coding style guidelines (enforced during CI???) 
+* `mirage/` - A mock backend used during development and testing +* `node_modules/` - npm dependencies - (ignored in `.gitignore`) +* `package.json` - Defines the npm package and its dependencies +* `package-lock.json` - Locks dependencies to specific versions providing consistency across + development and deployment +* `public/` - Static files that are merged into `dist/` during build +* `testem.js` - Integration with Test'em Scripts +* `tests/` - Frontend tests +* `vendor/` - frontend dependencies not distributed by npm; not currently used + +## Deployment - Heroku + +Crates.io is deployed on [Heroku](https://heroku.com/). See [`docs/MIRROR.md`][] for info about +setting up your own instance on Heroku! + +[`docs/MIRROR.md`]: https://github.com/rust-lang/crates.io/blob/master/docs/MIRROR.md + +These files are Heroku-specific; if you're deploying the crates.io codebase on another platform, +there's useful information in these files that you might need to translate to a different format +for another platform. + +* `app.json` - Configuration for Heroku Deploy Button +* `.buildpacks` - A list of buildpacks used during deployment +* `config/nginx.conf.erb` - Template used by the nginx buildpack +* `.diesel_version` - Used by diesel buildpack to install a specific version of Diesel CLI during + deployment +* `Procfile` - Contains process type declarations for Heroku + +## Development + +These files are mostly only relevant when running crates.io's code in development mode. + +* `.editorconfig` - Coding style definitions supported by some IDEs // TODO: Reference extensions + for common editors +* `.env` - Environment variables loaded by the backend - (ignored in `.gitignore`) +* `.env.sample` - Example environment file checked into the repository +* `.git/` - The git repository; not available in all deployments (e.g. 
Heroku) +* `.gitignore` - Configures git to ignore certain files and folders +* `script/init-local-index.sh` - Creates registry repositories used during development +* `tmp/` - Temporary files created during development; when deployed on Heroku this is the only + writable directory - (ignored in `.gitignore`) +* `tmp/index-bare` - A bare git repository, used as the origin for `tmp/index-co` during + development - (ignored in `.gitignore`) +* `.travis.yml` - Configuration for continuous integration at [TravisCI][] +* `.watchmanconfig` - Used by Ember CLI to efficiently watch for file changes if you install watchman + +[TravisCI]: https://travis-ci.org/rust-lang/crates.io diff --git a/docs/BACKEND.md b/docs/BACKEND.md new file mode 100644 index 00000000000..205853d762a --- /dev/null +++ b/docs/BACKEND.md @@ -0,0 +1,118 @@ +# Backend Overview + +## Server + +The code to actually run the server is in *src/bin/server.rs*. This is where most of the pieces of +the system are instantiated and configured, and can be thought of as the "entry point" to crates.io. + +The server does the following things: + +1. Initializes logging +2. Checks out the index git repository, if it isn't already checked out +3. Reads values from environment variables to configure a new instance of `cargo_registry::App` +4. Adds middleware to the app by calling `cargo_registry::middleware` +5. Syncs the categories defined in *src/categories.toml* with the categories in the database +6. Starts a [civet][] `Server` that uses the `cargo_registry::App` instance +7. Tells Nginx on Heroku that the application is ready to receive requests, if running on Heroku +8. Blocks forever (or until the process is killed) waiting to receive messages on a channel that no + messages are ever sent to, in order to outlive the civet `Server` threads + +[civet]: https://crates.io/crates/civet + +## Routes + +The API URLs that the server responds to (aka "routes") are defined in +*src/lib.rs*. 
+ +All of the `api_router` routes are mounted under the `/api/v1` path (see the +lines that look like `router.get("/api/v1/*path", R(api_router.clone()));`). + +Each API route definition looks like this: + +```rust +api_router.get("/crates", C(krate::index)); +``` + +This line defines a route that responds to a GET request made to +`/api/v1/crates` with the results of calling the `krate::index` function. `C` +is a struct that holds a function and implements the [`conduit::Handler`][] +trait so that the results of the function are the response if the function +succeeds, and that the server returns an error response if the function doesn't +succeed. The `C` struct's purpose is to reduce some boilerplate. + +[`conduit::Handler`]: https://docs.rs/conduit/0.8.1/conduit/trait.Handler.html + +## Code having to do with running a web application + +These modules could *maybe* be refactored into another crate. Maybe not. But their primary purpose +is supporting the running of crates.io's web application parts, and they don't have much to do with +the crate registry purpose of the application. + +### The `app` module + +This contains the `App` struct, which holds a `Config` instance plus a few more application +components such as: + +- The database connection pools (there are two until we finish migrating the app to use Diesel + everywhere) +- The GitHub OAuth configuration +- The cookie session key given to [conduit-cookie][] +- The `git2::Repository` instance for the index repo checkout +- The `Config` instance + +This module also contains `AppMiddleware`, which implements the `Middleware` trait in order to +inject the `app` instance into every request. That way, we can call `req.app()` to get to any of +these components. 
+ +[conduit-cookie]: https://crates.io/crates/conduit-cookie + +### The `config` module + +### The `db` module + +### The `dist` module + +### The `http` module + +### The `model` module + +### The `schema` module + +### The `util` module + +## Code having to do with managing a registry of crates + +These modules are specific to the domain of being a crate registry. These concepts would exist no +matter what language or framework crates.io was implemented in. + +### The `krate` module + +### The `user` module + +### The `badge` module + +### The `categories` module + +### The `category` module + +### The `dependency` module + +### The `download` module + +### The `git` module + +### The `keyword` module + +### The `owner` module + +### The `upload` module + +### The `uploaders` module + +### The `version` module + +## Database + +## Tests + +## Scripts diff --git a/docs/FRONTEND.md b/docs/FRONTEND.md new file mode 100644 index 00000000000..6a7ae29f740 --- /dev/null +++ b/docs/FRONTEND.md @@ -0,0 +1,7 @@ +# Frontend Overview + +The frontend of crates.io is written in JavaScript using [Ember.js][]. Most of that code lives in +the *app* directory. We endeavor to follow Ember conventions and best practices, but we're Rust +developers, so we don't always live up to this goal :) + +[Ember.js]: https://emberjs.com/ diff --git a/src/app.rs b/src/app.rs index cf99744f9e6..3f8708cff6d 100644 --- a/src/app.rs +++ b/src/app.rs @@ -1,3 +1,5 @@ +//! 
Application-wide components in a struct accessible from each request + use std::env; use std::error::Error; use std::path::PathBuf; @@ -18,14 +20,19 @@ pub struct App { /// The database connection pool pub database: db::Pool, - /// The database connection pool + /// The diesel database connection pool pub diesel_database: db::DieselPool, /// The GitHub OAuth2 configuration pub github: oauth2::Config, + /// A unique key used with conduit_cookie to generate cookies pub session_key: String, + + /// The crate index git repository pub git_repo: Mutex, + + /// The location on disk of the checkout of the crate index git repository pub git_repo_checkout: PathBuf, /// The server configuration @@ -38,6 +45,13 @@ pub struct AppMiddleware { } impl App { + /// Creates a new `App` with a given `Config` + /// + /// Configures and sets up: + /// + /// - GitHub OAuth + /// - Database connection pools + /// - A `git2::Repository` instance from the index repo checkout (that server.rs ensures exists) pub fn new(config: &Config) -> App { let mut github = oauth2::Config::new( &config.gh_client_id, @@ -45,7 +59,6 @@ impl App { "https://github.com/login/oauth/authorize", "https://github.com/login/oauth/access_token", ); - github.scopes.push(String::from("read:org")); let db_pool_size = match (env::var("DB_POOL_SIZE"), config.env) { @@ -66,6 +79,7 @@ impl App { _ => 1, }; + // We need two connection pools until we finish transitioning everything to use diesel. let db_config = r2d2::Config::builder() .pool_size(db_pool_size) .min_idle(db_min_idle) @@ -78,6 +92,7 @@ impl App { .build(); let repo = git2::Repository::open(&config.git_repo_checkout).unwrap(); + App { database: db::pool(&config.db_url, db_config), diesel_database: db::diesel_pool(&config.db_url, diesel_db_config), @@ -89,6 +104,11 @@ impl App { } } + /// Returns a handle for making HTTP requests to upload crate files. 
+ /// + /// The handle will go through a proxy if the uploader being used has specified one, which + /// is only done in test mode in order to be able to record and inspect the HTTP requests + /// that tests make. pub fn handle(&self) -> Easy { let mut handle = Easy::new(); if let Some(proxy) = self.config.uploader.proxy() { diff --git a/src/bin/server.rs b/src/bin/server.rs index 8cf6099e56b..4b52191d64d 100644 --- a/src/bin/server.rs +++ b/src/bin/server.rs @@ -17,10 +17,15 @@ use std::sync::mpsc::channel; #[allow(dead_code)] fn main() { + // Initialize logging env_logger::init().unwrap(); + + // If there isn't a git checkout containing the crate index repo at the path specified + // by `GIT_REPO_CHECKOUT`, delete that directory and clone the repo specified by `GIT_REPO_URL` + // into that directory instead. Uses the credentials specified in `GIT_HTTP_USER` and + // `GIT_HTTP_PWD` via the `cargo_registry::git::credentials` function. let url = env("GIT_REPO_URL"); let checkout = PathBuf::from(env("GIT_REPO_CHECKOUT")); - let repo = match git2::Repository::open(&checkout) { Ok(r) => r, Err(..) => { @@ -36,11 +41,15 @@ fn main() { .unwrap() } }; + + // All commits to the index registry made through crates.io will be made by bors, the Rust + // community's friendly GitHub bot. let mut cfg = repo.config().unwrap(); cfg.set_str("user.name", "bors").unwrap(); cfg.set_str("user.email", "bors@rust-lang.org").unwrap(); let api_protocol = String::from("https"); + let mirror = if env::var("MIRROR").is_ok() { Replica::ReadOnlyMirror } else { @@ -56,7 +65,9 @@ fn main() { let uploader = match (cargo_env, mirror) { (Env::Production, Replica::Primary) => { - // `env` panics if these vars are not set + // `env` panics if these vars are not set, and in production for a primary instance, + // that's what we want since we don't want to be able to start the server if the server + // doesn't know where to upload crates. 
Uploader::S3 { bucket: s3::Bucket::new( env("S3_BUCKET"), @@ -69,8 +80,14 @@ fn main() { } } (Env::Production, Replica::ReadOnlyMirror) => { - // Read-only mirrors don't need access key or secret key, - // but they might have them. Definitely need bucket though. + // Read-only mirrors don't need access key or secret key since by definition, + // they'll only need to read from a bucket, not upload. + // + // Read-only mirrors might have access key or secret key, so use them if those + // environment variables are set. + // + // Read-only mirrors definitely need bucket though, so that they know where + // to serve crate files from. Uploader::S3 { bucket: s3::Bucket::new( env("S3_BUCKET"), @@ -82,8 +99,13 @@ fn main() { proxy: None, } } + // In Development mode, either running as a primary instance or a read-only mirror _ => { if env::var("S3_BUCKET").is_ok() { + // If we've set the `S3_BUCKET` variable to any value, use all of the values + // for the related S3 environment variables and configure the app to upload to + // and read from S3 like production does. All values except for bucket are + // optional, like production read-only mirrors. println!("Using S3 uploader"); Uploader::S3 { bucket: s3::Bucket::new( @@ -96,6 +118,9 @@ fn main() { proxy: None, } } else { + // If we don't set the `S3_BUCKET` variable, we'll use a development-only + // uploader that makes it possible to run and publish to a locally-running + // crates.io instance without needing to set up an account and a bucket in S3. 
println!("Using local uploader, crate files will be in the dist directory"); Uploader::Local } @@ -110,13 +135,15 @@ fn main() { gh_client_secret: env("GH_CLIENT_SECRET"), db_url: env("DATABASE_URL"), env: cargo_env, - max_upload_size: 10 * 1024 * 1024, + max_upload_size: 10 * 1024 * 1024, // 10 MB default file upload size limit mirror: mirror, api_protocol: api_protocol, }; let app = cargo_registry::App::new(&config); let app = cargo_registry::middleware(Arc::new(app)); + // On every server restart, ensure the categories available in the database match + // the information in *src/categories.toml*. cargo_registry::categories::sync().unwrap(); let port = if heroku { @@ -131,7 +158,11 @@ fn main() { let mut cfg = civet::Config::new(); cfg.port(port).threads(threads).keep_alive(true); let _a = Server::start(cfg, app); + println!("listening on port {}", port); + + // Creating this file tells heroku to tell nginx that the application is ready + // to receive traffic. if heroku { File::create("/tmp/app-initialized").unwrap(); } diff --git a/src/lib.rs b/src/lib.rs index bf446745b96..b9b3e57f161 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,8 +1,9 @@ //! This crate implements the backend server for https://crates.io/ //! //! All implemented routes are defined in the [middleware](fn.middleware.html) function and -//! implemented in the [keyword](keyword/index.html), [krate](krate/index.html), -//! [user](user/index.html) and [version](version/index.html) modules. +//! implemented in the [category](category/index.html), [keyword](keyword/index.html), +//! [krate](krate/index.html), [user](user/index.html) and [version](version/index.html) modules. + #![deny(warnings)] #![cfg_attr(feature = "clippy", feature(plugin))] #![cfg_attr(feature = "clippy", plugin(clippy))] @@ -95,6 +96,12 @@ pub mod version; mod pagination; +/// Used for setting different values depending on whether the app is being run in production, +/// in development, or for testing. 
+/// +/// The app's `config.env` value is set in *src/bin/server.rs* to `Production` if the environment +/// variable `HEROKU` is set and `Development` otherwise. `config.env` is set to `Test` +/// unconditionally in *src/test/all.rs*. #[derive(PartialEq, Eq, Clone, Copy, Debug)] pub enum Env { Development, @@ -102,14 +109,26 @@ pub enum Env { Production, } -// There may be more ways to run crates.io servers in the future, such as a -// mirror that also has private crates that crates.io does not have. +/// Used for setting different values depending on the type of registry this instance is. +/// +/// `Primary` indicates this instance is a primary registry that is the source of truth for these +/// crates' information. `ReadOnlyMirror` indicates this instance is a read-only mirror of crate +/// information that exists on another instance. +/// +/// The app's `config.mirror` value is set in *src/bin/server.rs* to `ReadOnlyMirror` if the +/// `MIRROR` environment variable is set and to `Primary` otherwise. +/// +/// There may be more ways to run crates.io servers in the future, such as a +/// mirror that also has private crates that crates.io does not have. #[derive(PartialEq, Eq, Clone, Copy, Debug)] pub enum Replica { Primary, ReadOnlyMirror, } +/// Configures routes, sessions, logging, and other middleware. +/// +/// Called from *src/bin/server.rs*. pub fn middleware(app: Arc) -> MiddlewareBuilder { let mut api_router = RouteBuilder::new(); @@ -175,6 +194,9 @@ pub fn middleware(app: Arc) -> MiddlewareBuilder { router.delete("/me/tokens/:id", C(token::revoke)); router.get("/summary", C(krate::summary)); + // Only serve the local checkout of the git index in development mode. + // In production, for crates.io, cargo gets the index from + // https://github.com/rust-lang/crates.io-index directly. 
let env = app.config.env; if env == Env::Development { let s = conduit_git_http_backend::Serve(app.git_repo_checkout.clone()); @@ -184,12 +206,16 @@ pub fn middleware(app: Arc) -> MiddlewareBuilder { } let mut m = MiddlewareBuilder::new(R404(router)); + if env == Env::Development { + // DebugMiddleware is defined below to print logs for each request. m.add(DebugMiddleware); } + if env != Env::Test { m.add(conduit_log_requests::LogRequests(log::LogLevel::Info)); } + m.around(util::Head::default()); m.add(conduit_conditional_get::ConditionalGet); m.add(conduit_cookie::Middleware::new(app.session_key.as_bytes())); @@ -198,10 +224,18 @@ pub fn middleware(app: Arc) -> MiddlewareBuilder { env == Env::Production, )); m.add(app::AppMiddleware::new(app)); + + // Run each request in a transaction and roll back the transaction if the request results + // in an error. Not used when running tests because each test is run in a transaction. if env != Env::Test { m.add(db::TransactionMiddleware); } + + // Sets the current user on each request. m.add(user::Middleware); + + // Serve the static files in the *dist* directory, which are the frontend assets. + // Not needed for the backend tests. if env != Env::Test { m.around(dist::Middleware::default()); } @@ -240,14 +274,27 @@ pub fn middleware(app: Arc) -> MiddlewareBuilder { } } +/// Convenience function for getting the current server time in UTC. pub fn now() -> time::Timespec { time::now_utc().to_timespec() } +/// Convenience function for getting a time in RFC 3339 format. +/// +/// Example: `2012-02-22T14:53:18Z`. Used for returning time values in JSON API responses. pub fn encode_time(ts: time::Timespec) -> String { time::at_utc(ts).rfc3339().to_string() } +/// Convenience function requiring that an environment variable is set. +/// +/// Ensures that we've initialized the dotenv crate in order to read environment variables +/// from a *.env* file if present. Don't use this for optionally set environment variables. 
+/// +/// # Panics +/// +/// Panics if the environment variable with the name passed in as an argument is not defined +/// in the current environment. pub fn env(s: &str) -> String { dotenv::dotenv().ok(); ::std::env::var(s).unwrap_or_else(|_| panic!("must have `{}` defined", s)) From 93fb656bdabdef88abfb505127e3fc4b85d0b4f7 Mon Sep 17 00:00:00 2001 From: "Carol (Nichols || Goulding)" Date: Wed, 2 Aug 2017 09:23:30 -0400 Subject: [PATCH 2/3] Directories too --- docs/ARCHITECTURE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 7ebe4820d2a..a13c8019412 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -23,7 +23,7 @@ related crates. More information about the backend is in [conduit]: https://crates.io/crates/conduit -These files have to do with the backend: +These files and directories have to do with the backend: * `build.rs` - Cargo build script * `Cargo.lock` - Locks dependencies to specific versions providing consistency across development From 25383e50843a8aa5d25684ebd37d9b1685355939 Mon Sep 17 00:00:00 2001 From: "Carol (Nichols || Goulding)" Date: Wed, 2 Aug 2017 09:35:58 -0400 Subject: [PATCH 3/3] Correct license in package.json --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 4090f476cb1..e164b315d03 100644 --- a/package.json +++ b/package.json @@ -5,7 +5,7 @@ "bugs": { "url": "https://github.com/rust-lang/crates.io/issues" }, - "license": "MIT", + "license": "(MIT OR Apache-2.0)", "author": "", "directories": { "doc": "docs",