Skip to content

Commit 3e92404

Browse files
committed
[FIX] spreadsheet: batch process spreadsheet_revision.commands
Some dbs have `spreadsheet_revision` records with over 10 million characters in `commands`. If the number of such records is high, loading them all at once leads to memory errors. We distribute the records into buckets whose total size is at most `MEMORY_CAP`, and use a named cursor to process them one bucket at a time. Any record whose `commands` is larger than `MEMORY_CAP` is placed in a bucket of its own.
1 parent 86409e5 commit 3e92404

File tree

1 file changed

+49
-23
lines changed

1 file changed

+49
-23
lines changed

src/util/spreadsheet/misc.py

Lines changed: 49 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,58 @@
1-
from .. import json
1+
from .. import json, pg
2+
3+
MEMORY_CAP = 2 * 10**8 # 200MB
24

35

46
def iter_commands(cr, like_all=(), like_any=()):
57
if not (bool(like_all) ^ bool(like_any)):
68
raise ValueError("Please specify `like_all` or `like_any`, not both")
7-
cr.execute(
8-
"""
9-
SELECT id,
10-
commands
11-
FROM spreadsheet_revision
12-
WHERE commands LIKE {}(%s::text[])
13-
""".format("ALL" if like_all else "ANY"),
14-
[list(like_all or like_any)],
15-
)
16-
for revision_id, data in cr.fetchall():
17-
data_loaded = json.loads(data)
18-
if "commands" not in data_loaded:
19-
continue
20-
data_old = json.dumps(data_loaded, sort_keys=True)
21-
22-
changed = yield data_loaded["commands"]
23-
if changed is None:
24-
changed = data_old != json.dumps(data_loaded, sort_keys=True)
25-
26-
if changed:
27-
cr.execute(
28-
"UPDATE spreadsheet_revision SET commands=%s WHERE id=%s", [json.dumps(data_loaded), revision_id]
9+
10+
with pg.named_cursor(cr, itersize=1) as ncr:
11+
ncr.execute(
12+
"""
13+
WITH filtered AS (
14+
SELECT id,
15+
commands,
16+
LENGTH(commands) AS commands_length
17+
FROM spreadsheet_revision
18+
WHERE commands LIKE {condition} (%s::text[])
19+
), smaller AS (
20+
SELECT id,
21+
commands,
22+
sum(commands_length) OVER (ORDER BY id) / %s AS num
23+
FROM filtered
24+
WHERE commands_length <= %s
2925
)
26+
SELECT array_agg(id ORDER BY id),
27+
array_agg(commands ORDER BY id)
28+
FROM smaller
29+
GROUP BY num
30+
31+
UNION ALL
32+
33+
SELECT ARRAY[id],
34+
ARRAY[commands]
35+
FROM filtered
36+
WHERE commands_length > %s
37+
""".format(condition=pg.SQLStr("ALL" if like_all else "ANY")),
38+
[list(like_any or like_all), MEMORY_CAP, MEMORY_CAP, MEMORY_CAP],
39+
)
40+
for ids, commands in ncr:
41+
for revision_id, data in zip(ids, commands):
42+
data_loaded = json.loads(data)
43+
if "commands" not in data_loaded:
44+
continue
45+
data_old = json.dumps(data_loaded, sort_keys=True)
46+
47+
changed = yield data_loaded["commands"]
48+
if changed is None:
49+
changed = data_old != json.dumps(data_loaded, sort_keys=True)
50+
51+
if changed:
52+
cr.execute(
53+
"UPDATE spreadsheet_revision SET commands=%s WHERE id=%s",
54+
[json.dumps(data_loaded), revision_id],
55+
)
3056

3157

3258
def process_commands(cr, callback, *args, **kwargs):

0 commit comments

Comments
 (0)