Skip to content

Commit 71a6a1a

Browse files
Merge #919
919: Fix keywords that weren't lowercased before inserting r=carols10cents Background for anyone unfamiliar with crates.io's database schema: - We've got crates - We've got keywords - Crates can have many keywords and keywords can have many crates - There's a join table `crates_keywords` that contains records associating crate.ids and keyword.ids This migration fixes invalid data that currently exists in the database. Between commits d5c5635 and 4c2bd42, it was possible for keywords to be created without lowercasing them first. Then, due to this code: ``` pub fn find_or_create_all(conn: &PgConnection, names: &[&str]) -> QueryResult<Vec<Keyword>> { // ...elided... keywords::table .filter(::lower(keywords::keyword).eq(any(&lowercase_names))) .load(conn) } ``` which selected any keywords that matched the given keywords *regardless* of case and associated these keywords with crates, many crates now have BOTH the uppercase and lowercase variants of a keyword associated with them. That is, the keywords table contains: | id | keyword | | |----|---------|-| | 1 | foo | | | 2 | Foo | this record should not exist | And a crate with id 5 that was published with any of: `keywords = ["foo"]` `keywords = ["Foo"]` `keywords = ["FOO"]` etc in their Cargo.toml metadata would get the following records created in the crates_keywords table: | crate_id | keyword_id | | |----------|------------|-| | 5 | 1 | this record is correct, for all cases | | 5 | 2 | this record is incorrect, for all cases | So this migration removes all the crates_keywords rows associated with keywords that aren't lowercased and then removes the keywords themselves. BUT! There are some crates that only have these associations currently with the uppercased keyword! That is, crate id 5 has this in crates_keywords: | crate_id | keyword_id | |----------|------------| | 5 | 2 | So BEFORE making the fix above, this migration inserts the corresponding lowercased keyword association for these crates (so 5, 1 in this case) so that these crates can be treated like all the rest that are associated with both. ALSO! There are some keywords that have only ever been used in their uppercase variant, so the corresponding lowercase keyword doesn't exist at all. Such as: | id | keyword | |----|---------| | 3 | BAR | In this case, the migration updates the keyword value, leaving the associated crates_keywords alone. WHEW!
2 parents 77931ce + 772e009 commit 71a6a1a

File tree

3 files changed

+112
-1
lines changed

3 files changed

+112
-1
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
-- Not reversible
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
-- For all keywords that have a corresponding lowercased keyword,
2+
-- (these keywords should not have been created but there was a bug
3+
-- that created them; that has been fixed and no more are being created)
4+
WITH messed_up_keywords AS (
5+
select keywords.id as upper_id, k.id as lower_id
6+
from keywords
7+
inner join keywords as k on LOWER(keywords.keyword) = k.keyword
8+
where LOWER(keywords.keyword) != keywords.keyword
9+
)
10+
-- Find all the crates that use the uppercased keyword BUT NOT the lowercased keyword
11+
-- (many crates are associated with both lower and upper cased because of a bug)
12+
, messed_up_crates AS (
13+
select crate_id, upper_id, lower_id
14+
from crates_keywords
15+
inner join messed_up_keywords on crates_keywords.keyword_id = messed_up_keywords.upper_id
16+
where messed_up_keywords.lower_id not in (
17+
select keyword_id
18+
from crates_keywords as ck
19+
where ck.crate_id = crates_keywords.crate_id
20+
)
21+
)
22+
-- Associate these crates with the lowercased keyword AS WELL AS the uppercased keyword
23+
INSERT INTO crates_keywords (crate_id, keyword_id)
24+
SELECT crate_id, lower_id as keyword_id
25+
FROM messed_up_crates
26+
;
27+
28+
-- For all keywords that have a corresponding lowercased keyword,
29+
-- (this is repeated exactly from above)
30+
WITH messed_up_keywords AS (
31+
select keywords.id as upper_id, k.id as lower_id
32+
from keywords
33+
inner join keywords as k on LOWER(keywords.keyword) = k.keyword
34+
where LOWER(keywords.keyword) != keywords.keyword
35+
)
36+
-- Delete records associating crates to the uppercased keyword
37+
DELETE
38+
FROM crates_keywords
39+
WHERE crates_keywords.keyword_id IN (
40+
SELECT upper_id FROM messed_up_keywords
41+
)
42+
;
43+
44+
-- For all keywords that have a corresponding lowercased keyword,
45+
-- (this is repeated exactly from above)
46+
WITH messed_up_keywords AS (
47+
select keywords.id as upper_id, k.id as lower_id
48+
from keywords
49+
inner join keywords as k on LOWER(keywords.keyword) = k.keyword
50+
where LOWER(keywords.keyword) != keywords.keyword
51+
)
52+
-- Delete the uppercased keyword
53+
-- No more crates should be associated with these keywords because of
54+
-- the previous delete.
55+
DELETE
56+
FROM keywords
57+
WHERE keywords.id IN (
58+
SELECT upper_id FROM messed_up_keywords
59+
)
60+
;
61+
62+
-- For all keywords that are not properly lowercased but do not
63+
-- have a corresponding lowercased keyword, update them to be
64+
-- lower cased, preserving any crate associations using them.
65+
UPDATE keywords
66+
SET keyword = lower(keyword)
67+
WHERE keyword != lower(keyword)
68+
;

src/keyword.rs

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ impl Keyword {
7171
.execute(conn)?;
7272
}
7373
keywords::table
74-
.filter(::lower(keywords::keyword).eq(any(&lowercase_names)))
74+
.filter(keywords::keyword.eq(any(&lowercase_names)))
7575
.load(conn)
7676
}
7777

@@ -189,3 +189,45 @@ pub fn show(req: &mut Request) -> CargoResult<Response> {
189189
}
190190
Ok(req.json(&R { keyword: kw.encodable() }))
191191
}
192+
193+
#[cfg(test)]
194+
mod tests {
195+
use super::*;
196+
use dotenv::dotenv;
197+
use std::env;
198+
use diesel;
199+
use diesel::connection::SimpleConnection;
200+
201+
#[derive(Insertable)]
202+
#[table_name = "keywords"]
203+
struct NewKeyword<'a> {
204+
keyword: &'a str,
205+
}
206+
207+
fn pg_connection() -> PgConnection {
208+
let _ = dotenv();
209+
let database_url =
210+
env::var("TEST_DATABASE_URL").expect("TEST_DATABASE_URL must be set to run tests");
211+
let conn = PgConnection::establish(&database_url).unwrap();
212+
// These tests deadlock if run concurrently
213+
conn.batch_execute("BEGIN;").unwrap();
214+
conn
215+
}
216+
217+
#[test]
218+
fn dont_associate_with_non_lowercased_keywords() {
219+
let conn = pg_connection();
220+
// The code should be preventing lowercased keywords from existing,
221+
// but if one happens to sneak in there, don't associate crates with it.
222+
let bad_keyword = NewKeyword { keyword: "NO" };
223+
224+
diesel::insert(&bad_keyword)
225+
.into(keywords::table)
226+
.execute(&conn)
227+
.unwrap();
228+
229+
let associated = Keyword::find_or_create_all(&conn, &["no"]).unwrap();
230+
assert_eq!(associated.len(), 1);
231+
assert_eq!(associated.first().unwrap().keyword, "no");
232+
}
233+
}

0 commit comments

Comments
 (0)