Skip to content

Commit 31cb1d2

Browse files
committed
tools/content: Support systematically surveying unimplemented content features.
We added 2 scripts. - fetch_messages.dart, the script that fetches messages from a given Zulip server, that does not depend on Flutter or other involved Zulip Flutter packages, so that it can run without Flutter. It is meant to be run first to produce the corpuses needed for surveying the unimplemented features. The fetched messages are formatted in JSON Lines format, where each individual entry is JSON containing the message ID and the rendered HTML content. The script stores output in separate files for messages from each server, because message IDs are not unique across them. - unimplemented_features_test.dart, a test that goes over all messages collected, parses them with the content parser, and reports the unimplemented features it discovered. This is implemented as a test mainly because of its dependency on the content parser, which depends on the Flutter engine (and `flutter test` conveniently sets up a test device). The test can be run manually via: `flutter test --dart-define=corpusDir=path/to/corpusDir tools/content` We mostly avoid prints (https://dart.dev/tools/linter-rules/avoid_print) in both scripts. While we don't lose much by disabling this lint rule for them, because they are supposed to be CLI programs after all, the rule (potentially) helps with reducing developer inclination to be verbose. See comments from the scripts for more details on the implementations. Signed-off-by: Zixuan James Li <[email protected]>
1 parent 4bfc212 commit 31cb1d2

File tree

3 files changed

+403
-0
lines changed

3 files changed

+403
-0
lines changed

tools/content/fetch_messages.dart

Lines changed: 221 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,221 @@
1+
#!/usr/bin/env dart
2+
3+
import 'dart:convert';
4+
import 'dart:io';
5+
import 'dart:math';
6+
7+
// Avoid any Flutter-related dependencies so this can be run as a CLI program.
8+
import 'package:args/args.dart';
9+
import 'package:http/http.dart';
10+
import 'package:ini/ini.dart' as ini;
11+
import 'package:zulip/api/backoff.dart';
12+
13+
import 'model.dart';
14+
15+
/// Fetch all public message contents from a Zulip server in bulk.
16+
///
17+
/// It outputs JSON entries of the message IDs and the rendered HTML contents in
18+
/// JSON Lines (https://jsonlines.org) format. The output can be used later to
19+
/// perform checks for discovering unimplemented features.
20+
///
21+
/// Because message IDs are only unique within a single server, the script
22+
/// names corpuses from each server differently (if --corpus-dir is specified).
23+
///
24+
/// For more help, run `tools/content/fetch_messages.dart --help`.
25+
///
26+
/// See also:
27+
/// * tools/content/unimplemented_features_test.dart, which runs checks against
28+
/// the fetched corpuses.
29+
void main(List<String> args) async {
  // Overview: parse the command line, read the credentials from the zuliprc
  // file, find the anchor to resume from in any existing corpus file, then
  // fetch batches of messages until the server returns an empty batch.
  final argParser = ArgParser();
  argParser.addOption(
    'config-file',
    help: 'A zuliprc file with identity information including email, API key\n'
          'and the Zulip server URL to fetch the messages from (required).\n\n'
          'To get the file, see\n'
          'https://zulip.com/api/configuring-python-bindings#download-a-zuliprc-file.',
    valueHelp: 'path/to/zuliprc',
  );
  argParser.addOption(
    'corpus-dir',
    help: 'The directory to look for/store the corpus file. If not given,\n'
          'the script will write output to stdout. Otherwise, this will\n'
          'first read from the existing corpus file (assumed to be named\n'
          'as "host name of the Zulip server.jsonl") to avoid duplicates\n'
          'before fetching more messages',
    // The option names a directory (see `Directory(outputDirStr)` below),
    // so the placeholder shows a directory rather than a file.
    valueHelp: 'path/to/corpus-dir',
  );
  argParser.addFlag(
    'fetch-newer',
    help: 'Fetch newer messages instead of older ones.\n'
          'Only useful when there is a matching corpus file in corpus-dir.',
    defaultsTo: false,
  );
  argParser.addFlag(
    'help', abbr: 'h',
    negatable: false,
    help: 'Show this help message.',
  );

  void printUsage() {
    // Give it a pass when printing the help message.
    // ignore: avoid_print
    print('usage: fetch_messages --config-file <CONFIG_FILE>\n\n'
          'Fetch message contents from a Zulip server in bulk.\n\n'
          '${argParser.usage}');
  }

  // Prints the usage text, then aborts by throwing with the given error.
  Never throwWithUsage(String error) {
    printUsage();
    throw Exception('\nError: $error');
  }

  final parsedArguments = argParser.parse(args);
  if (parsedArguments['help'] as bool) {
    printUsage();
    exit(0);
  }

  final zuliprc = parsedArguments['config-file'] as String?;
  if (zuliprc == null) {
    throwWithUsage('"config-file" is required');
  }

  final configFile = File(zuliprc);
  if (!configFile.existsSync()) {
    throwWithUsage('Config file "$zuliprc" does not exist');
  }

  // `zuliprc` is a file in INI format containing the user's identity
  // information.
  //
  // See also:
  //   https://zulip.com/api/configuring-python-bindings#configuration-keys-and-environment-variables
  final parsedConfig = ini.Config.fromString(configFile.readAsStringSync());
  final email = parsedConfig.get('api', 'email') as String;
  final apiKey = parsedConfig.get('api', 'key') as String;
  final site = Uri.parse(parsedConfig.get('api', 'site') as String);

  final outputDirStr = parsedArguments['corpus-dir'] as String?;
  final fetchNewer = parsedArguments['fetch-newer'] as bool;
  int? anchorMessageId;
  IOSink output = stdout;
  if (outputDirStr != null) {
    final outputDir = Directory(outputDirStr);
    outputDir.createSync(recursive: true);
    final outputFile = File('$outputDirStr/${site.host}.jsonl');
    if (!outputFile.existsSync()) outputFile.createSync();
    // Look for the known newest/oldest message so that we can continue
    // fetching from where we left off.
    await for (final message in readMessagesFromJsonl(outputFile)) {
      anchorMessageId ??= message.id;
      // Newer Zulip messages have higher message IDs.
      anchorMessageId = (fetchNewer ? max : min)(message.id, anchorMessageId);
    }
    output = outputFile.openWrite(mode: FileMode.writeOnlyAppend);
  }

  final client = Client();
  final authHeader = 'Basic ${base64Encode(utf8.encode('$email:$apiKey'))}';

  // These are working constants chosen arbitrarily.
  const batchSize = 5000;
  const maxRetries = 10;
  const fetchInterval = Duration(seconds: 5);

  // NOTE(review): `retries` is never reset after a successful fetch, so
  // `maxRetries` bounds the total number of failed requests in one run,
  // whereas `backoff` is reset after each success. Confirm this is intended.
  int retries = 0;
  BackoffMachine? backoff;

  while (true) {
    // This loops until there is no message fetched in an iteration.
    final _GetMessagesResult result;
    try {
      result = await _getMessages(client, realmUrl: site,
        authHeader: authHeader,
        anchorMessageId: anchorMessageId,
        numBefore: (!fetchNewer) ? batchSize : 0,
        numAfter: (fetchNewer) ? batchSize : 0,
      );
    } catch (e) {
      // We could have more fine-grained error handling and avoid retrying on
      // non-network-related failures, but that's not necessary.
      if (retries >= maxRetries) {
        rethrow;
      }
      retries++;
      await (backoff ??= BackoffMachine()).wait();
      continue;
    }

    // Parse the batch once, eagerly. The entries are inspected several times
    // below, and a lazy `map` would re-run [MessageEntry.fromJson] on every
    // message for each of those passes.
    final messageEntries = result.messages.map(MessageEntry.fromJson).toList();
    if (messageEntries.isEmpty) {
      // Sanity check to ensure that the server agrees
      // there is no more messages to fetch.
      // (Dart only evaluates `assert` when assertions are enabled.)
      if (fetchNewer) assert(result.foundNewest);
      if (!fetchNewer) assert(result.foundOldest);
      break;
    }

    // Find and use the newest/oldest message as the next message fetch anchor.
    anchorMessageId = messageEntries.map((x) => x.id).reduce(fetchNewer ? max : min);
    for (final entry in messageEntries) {
      output.writeln(jsonEncode(entry));
    }

    // This I/O operation could fail, but crashing is fine here.
    final flushFuture = output.flush();
    // Make sure the delay happens concurrently to the flush.
    await Future<void>.delayed(fetchInterval);
    await flushFuture;
    backoff = null;
  }
  exit(0);
}
172+
173+
/// https://zulip.com/api/get-messages#response
174+
// Partially ported from [GetMessagesResult] to avoid depending on Flutter libraries.
175+
class _GetMessagesResult {
  /// Creates a result from the already-extracted response fields.
  const _GetMessagesResult(this.foundOldest, this.foundNewest, this.messages);

  /// Reads `found_oldest`, `found_newest` and `messages` out of a decoded
  /// get-messages response body.
  ///
  /// Each element of `messages` is kept as a raw JSON object; no per-message
  /// parsing happens here.
  factory _GetMessagesResult.fromJson(Map<String, Object?> json) {
    final oldestReached = json['found_oldest'] as bool;
    final newestReached = json['found_newest'] as bool;
    final List<Map<String, Object?>> rawMessages = [
      for (final item in json['messages'] as List<Object?>)
        item as Map<String, Object?>,
    ];
    return _GetMessagesResult(oldestReached, newestReached, rawMessages);
  }

  /// The `found_oldest` field of the response.
  final bool foundOldest;

  /// The `found_newest` field of the response.
  final bool foundNewest;

  /// The `messages` field of the response, one JSON object per message.
  final List<Map<String, Object?>> messages;
}
188+
189+
/// https://zulip.com/api/get-messages
190+
Future<_GetMessagesResult> _getMessages(Client client, {
  required Uri realmUrl,
  required String authHeader,
  required int numBefore,
  required int numAfter,
  int? anchorMessageId,
}) async {
  // Sends one GET request to `/api/v1/messages` on [realmUrl], restricted to
  // public channels, and parses the response.
  //
  // Throws an [Exception] carrying the HTTP status code when the status is
  // not 200 or when the body is not a JSON object.
  final url = realmUrl.replace(
    path: '/api/v1/messages',
    queryParameters: {
      // This fallback will only be used when first fetching from a server.
      'anchor': anchorMessageId != null ? jsonEncode(anchorMessageId) : 'newest',
      // The anchor message already exists in the corpus,
      // so avoid fetching it again.
      'include_anchor': jsonEncode(anchorMessageId == null),
      'num_before': jsonEncode(numBefore),
      'num_after': jsonEncode(numAfter),
      'narrow': jsonEncode([{'operator': 'channels', 'operand': 'public'}]),
    });
  final response = await client.send(
    Request('GET', url)..headers['Authorization'] = authHeader);
  final bytes = await response.stream.toBytes();

  // An error response is not guaranteed to be JSON (or even valid UTF-8),
  // so a decoding failure must not hide the status code. Remember the
  // failure and report it together with the status below.
  Object? json;
  String? decodeError;
  try {
    json = jsonDecode(utf8.decode(bytes));
  } on FormatException catch (e) {
    decodeError = e.message;
  }

  if (response.statusCode != 200 || json is! Map<String, Object?>) {
    // Just crashing early here should be fine for this tool. We don't need
    // to handle the specific error codes.
    throw Exception('Failed to get messages. Code: ${response.statusCode}\n'
      'Details: ${json ?? decodeError ?? 'unknown'}');
  }
  return _GetMessagesResult.fromJson(json);
}

tools/content/model.dart

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
import 'dart:io';
2+
import 'dart:convert';
3+
4+
import 'package:json_annotation/json_annotation.dart';
5+
6+
/// A data structure representing a message.
7+
@JsonSerializable()
8+
final class MessageEntry {
  /// Creates an entry from a message ID and its rendered HTML content.
  const MessageEntry({required this.id, required this.content});

  /// Builds an entry from a single message object of a get-messages
  /// response, reading only its `id` and `content` fields.
  ///
  /// See also: https://zulip.com/api/get-messages#response
  factory MessageEntry.fromJson(Map<String, Object?> json) {
    final rawId = json['id'] as num;
    final html = json['content'] as String;
    return MessageEntry(id: rawId.toInt(), content: html);
  }

  /// The message ID, unique within a server.
  final int id;

  /// The rendered HTML of the message.
  final String content;

  /// Converts this entry to a JSON-encodable map with `id` and `content` keys.
  Map<String, Object> toJson() {
    return <String, Object>{'id': id, 'content': content};
  }
}
28+
29+
/// Open the given JSON Lines file and read [MessageEntry]'s from it.
30+
///
31+
/// We store the entries in JSON Lines format and return them from a stream to
32+
/// avoid excessive use of memory.
33+
Stream<MessageEntry> readMessagesFromJsonl(File file) {
  // Decode the file's bytes as UTF-8 text and cut the text into lines.
  final lines = file.openRead()
    .transform(utf8.decoder)
    .transform(const LineSplitter());
  // Each line is decoded as one JSON object and converted to an entry.
  return lines.map((line) =>
    MessageEntry.fromJson(jsonDecode(line) as Map<String, Object?>));
}

0 commit comments

Comments
 (0)