Skip to content

Commit b0d8d5a

Browse files
committed
tools/content: Support systematically surveying unimplemented content features.
We added 2 scripts. - fetch_messages.dart, the script that fetches messages from a given Zulip server, that does not depend on Flutter or other involved Zulip Flutter packages, so that it can run without Flutter. It is meant to be run first to produce the corpuses needed for surveying the unimplemented features. The fetched messages are formatted in JSON Lines format, where each individual entry is JSON containing the message ID and the rendered HTML content. The user is encouraged to have a separate file for messages from each server, because message IDs are not unique across them. - unimplemented_features_test.dart, a test that goes over all messages collected, parses them with the content parser, and reports the unimplemented features it discovers. This is implemented as a test mainly because of its dependency on the content parser, which depends on Flutter. It has to be run manually via: `flutter test --dart-define=corpusDir=path/to/corpusDir tools/content` See comments in the file for more instructions. Fixes: #190 Signed-off-by: Zixuan James Li <[email protected]>
1 parent 7b24f6d commit b0d8d5a

File tree

3 files changed

+389
-0
lines changed

3 files changed

+389
-0
lines changed

tools/content/fetch_messages.dart

Lines changed: 222 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,222 @@
1+
#!/usr/bin/env dart
2+
3+
import 'dart:convert';
4+
import 'dart:io';
5+
import 'dart:math';
6+
7+
// Avoid any Flutter-related dependencies so this can be run as a CLI program.
8+
import 'package:args/args.dart';
9+
import 'package:http/http.dart';
10+
import 'package:zulip/api/backoff.dart';
11+
12+
import 'model.dart';
13+
14+
/// Fetches message contents from the specified Zulip server in bulk.
///
/// Writes one JSON object per message, holding the message ID and the rendered
/// HTML content, in JSON Lines (https://jsonlines.org) format. The output can
/// be used later to perform checks for discovering unimplemented features.
///
/// Because message IDs are only unique within a single server, it is encouraged
/// to store corpuses from each server separately to avoid confusion when
/// identifying messages.
///
/// When `--file` names an existing corpus, that file is read first and the
/// oldest known message ID (or the newest, with `--fetch-newer`) is used as
/// the anchor, so the new output can be appended without duplicates.
///
/// Throws an [Exception] after printing the usage text when an option is
/// missing or malformed. Rethrows the last fetch error once a request has
/// failed more than `maxRetries` times in a row.
///
/// See tools/content/unimplemented_features_test.dart for more details.
void main(List<String> args) async {
  final argParser = ArgParser()
    ..addOption(
      'email',
      help: 'The email. See https://zulip.com/api/api-keys for help.',
      mandatory: true,
    )
    ..addOption(
      'api-key',
      help: 'The API key. See https://zulip.com/api/api-keys for help.',
      mandatory: true,
    )
    ..addOption(
      'site',
      help: 'The URL of the Zulip server to fetch messages from.',
      valueHelp: 'https://example.zulip.com',
      mandatory: true,
    )
    ..addOption(
      'file',
      help: 'The file to output the messages to. If not given, write output to\n'
        'stdout. Otherwise, if the file exists, its format should match the\n'
        'output of the program. This will first read from the file to avoid\n'
        'duplicates, by fetching messages starting from the newest/oldest\n'
        'known message, then append the output to the end of the file.',
      valueHelp: 'path/to/czo.jsonl',
    )
    ..addOption(
      'count',
      defaultsTo: '100',
      help: 'The total number of messages to fetch.',
    )
    ..addFlag(
      'fetch-newer',
      help: 'Fetch newer messages instead of older ones.\n'
        'Only useful when --file is supplied.',
      defaultsTo: false,
    )
    ..addFlag(
      'help', abbr: 'h',
      negatable: false,
      help: 'Show this help message.',
    );

  void printUsage() {
    // Give it a pass when printing the help message.
    // ignore: avoid_print
    print('usage: fetch_messages --email <EMAIL> --api-key <API_KEY> --site <SERVER_URL>\n\n'
      'Fetch message contents from the specified Zulip server in bulk.\n\n'
      '${argParser.usage}');
  }

  Never throwWithUsage(String error) {
    printUsage();
    throw Exception('\nError: $error');
  }

  final ArgResults parsedArguments;
  try {
    parsedArguments = argParser.parse(args);
  } on FormatException catch (e) {
    // Malformed command lines (e.g. unknown options) are reported by the
    // parser as a FormatException; show the usage text alongside the error.
    throwWithUsage(e.message);
  }
  if (parsedArguments['help'] as bool) {
    printUsage();
    exit(0);
  }

  // Reading a missing `mandatory` option with `[]` throws an ArgumentError
  // instead of returning null, so presence has to be checked beforehand for
  // the usage text to be printed.
  String requireOption(String name) {
    if (!parsedArguments.wasParsed(name)) {
      throwWithUsage('Option $name is required');
    }
    return parsedArguments[name] as String;
  }

  final email = requireOption('email');
  final apiKey = requireOption('api-key');
  final realmUrl = Uri.parse(requireOption('site'));

  final countStr = parsedArguments['count'] as String;
  final count = int.tryParse(countStr);
  if (count == null) {
    throwWithUsage('Option count must be an integer, but got: $countStr');
  }

  final outputPath = parsedArguments['file'] as String?;
  final fetchNewer = parsedArguments['fetch-newer'] as bool;
  int? anchorMessageId;
  IOSink output = stdout;
  if (outputPath != null) {
    final outputFile = File(outputPath);
    if (!outputFile.existsSync()) {
      outputFile.createSync();
    }
    await for (final message in readMessagesFromJsonl(outputFile)) {
      // Find the newest/oldest message ID as the anchor.
      anchorMessageId ??= message.id;
      anchorMessageId = (fetchNewer ? max : min)(message.id, anchorMessageId);
    }
    output = outputFile.openWrite(mode: FileMode.writeOnlyAppend);
  }

  final client = Client();
  final authHeader = 'Basic ${base64Encode(utf8.encode('$email:$apiKey'))}';

  // These are working constants chosen arbitrarily.
  const batchSize = 5000;
  const maxRetries = 10;
  const fetchInterval = Duration(seconds: 5);

  int retries = 0;
  int messageToFetch = count;
  BackoffMachine? backoff;

  while (messageToFetch > 0) {
    // Fetch messages in batches from newer messages to older messages by
    // default, until there aren't any more messages to be fetched.
    // Note that newer Zulip messages have higher IDs.
    final currentBatchSize = min(batchSize, messageToFetch);
    final _GetMessagesResult result;
    try {
      result = await _getMessages(client, realmUrl: realmUrl,
        authHeader: authHeader,
        anchorMessageId: anchorMessageId,
        numBefore: fetchNewer ? 0 : currentBatchSize,
        numAfter: fetchNewer ? currentBatchSize : 0,
      );
    } catch (e) {
      // We could have more fine-grained error handling and avoid retrying on
      // non-network-related failures, but that's skipped for now.
      if (retries >= maxRetries) {
        rethrow;
      }
      retries++;
      await (backoff ??= BackoffMachine()).wait();
      continue;
    }

    // A successful request ends the current streak of failures, so both the
    // retry budget and the backoff start afresh for the next batch.
    retries = 0;
    backoff = null;

    // Materialize the entries once; a lazy `map` would re-parse every raw
    // message on each of the traversals below.
    final messageEntries =
      result.messages.map(MessageEntry.fromRawMessage).toList();
    if (messageEntries.isEmpty) {
      if (fetchNewer) assert(result.foundNewest);
      if (!fetchNewer) assert(result.foundOldest);
      break;
    }

    // Find and use the newest/oldest message as the next message fetch anchor.
    anchorMessageId = messageEntries.map((x) => x.id).reduce(fetchNewer ? max : min);
    for (final entry in messageEntries) {
      output.writeln(jsonEncode(entry));
    }
    messageToFetch -= messageEntries.length;

    // This I/O operation could fail, but crashing is fine here.
    final flushFuture = output.flush();
    // Make sure the delay happens concurrently to the flush.
    if (messageToFetch > 0) await Future<void>.delayed(fetchInterval);
    await flushFuture;
  }
  exit(0);
}
174+
175+
/// The subset of a get-messages response that this script relies on.
///
/// See https://zulip.com/api/get-messages#response
// Ported from [GetMessagesResult] to avoid depending on Flutter libraries.
class _GetMessagesResult {
  /// The `found_oldest` field of the response.
  final bool foundOldest;

  /// The `found_newest` field of the response.
  final bool foundNewest;

  /// The elements of the `messages` array, left as raw JSON objects.
  final List<Map<String, Object?>> messages;

  const _GetMessagesResult(this.foundOldest, this.foundNewest, this.messages);

  /// Builds a result from the decoded JSON body of a get-messages response.
  ///
  /// Fails with a cast error if any of the three fields is missing or has an
  /// unexpected type.
  factory _GetMessagesResult.fromJson(Map<String, Object?> json) {
    final oldest = json['found_oldest'] as bool;
    final newest = json['found_newest'] as bool;
    final rawMessages = json['messages'] as List<Object?>;
    return _GetMessagesResult(oldest, newest, [
      for (final raw in rawMessages) raw as Map<String, Object?>,
    ]);
  }
}
190+
191+
/// Requests one batch of messages from the server at [realmUrl].
///
/// Sends a GET request to `/api/v1/messages`, authenticated with
/// [authHeader], asking for [numBefore] messages before and [numAfter]
/// messages after the anchor, using a `channels: public` narrow. The anchor
/// is [anchorMessageId] when given (and is then excluded from the result),
/// or `newest` otherwise.
///
/// Throws an [Exception] carrying the status code when the response status
/// is not 200 or when its body is not a JSON object.
Future<_GetMessagesResult> _getMessages(Client client, {
  required Uri realmUrl,
  required String authHeader,
  required int numBefore,
  required int numAfter,
  int? anchorMessageId,
}) async {
  final url = realmUrl.replace(
    path: '/api/v1/messages',
    queryParameters: {
      // This fallback will be used when there is no file given,
      // or when there are no known messages.
      'anchor': anchorMessageId != null ? jsonEncode(anchorMessageId) : 'newest',
      // A known anchor message already exists in the output,
      // so avoid fetching it again.
      'include_anchor': jsonEncode(anchorMessageId == null),
      'num_before': jsonEncode(numBefore),
      'num_after': jsonEncode(numAfter),
      'narrow': jsonEncode([{'operator': 'channels', 'operand': 'public'}]),
    });
  final response = await client.send(
    Request('GET', url)..headers['Authorization'] = authHeader);
  final bytes = await response.stream.toBytes();

  // An error response is not guaranteed to carry a JSON body (it may be
  // empty, or an HTML page from an intermediary). Decode leniently so that
  // such a response is still reported with its status code below, rather
  // than surfacing as an unrelated decoding error.
  Object? json;
  try {
    json = jsonDecode(utf8.decode(bytes));
  } on FormatException {
    json = null;
  }

  if (response.statusCode == 200 && json is Map<String, Object?>) {
    return _GetMessagesResult.fromJson(json);
  }

  // We could handle rate limiting or other error codes, but just crashing
  // early here should be fine for this tool.
  throw Exception('Failed to get messages. Code: ${response.statusCode}\nDetails: ${json ?? 'unknown'}');
}

tools/content/model.dart

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
import 'dart:io';
2+
import 'dart:convert';
3+
4+
import 'package:json_annotation/json_annotation.dart';
5+
6+
/// A message, reduced to its ID and its rendered HTML content.
@JsonSerializable()
final class MessageEntry {
  /// The message ID, unique within a server.
  final int id;

  /// The rendered HTML of the message.
  final String html;

  const MessageEntry({required this.id, required this.html});

  /// Builds an entry from one message object of a get-messages response,
  /// reading only its `id` and `content` fields.
  ///
  /// See also: https://zulip.com/api/get-messages#response
  factory MessageEntry.fromRawMessage(Map<String, Object?> json) {
    final messageId = (json['id'] as num).toInt();
    final content = json['content'] as String;
    return MessageEntry(id: messageId, html: content);
  }

  /// Builds an entry from the form produced by [toJson].
  factory MessageEntry.fromJson(Map<String, Object?> json) {
    final messageId = (json['id'] as num).toInt();
    final content = json['html'] as String;
    return MessageEntry(id: messageId, html: content);
  }

  /// Returns the JSON form of this entry, with the keys `id` and `html`.
  Map<String, Object> toJson() {
    return {'id': id, 'html': html};
  }
}
31+
32+
/// Opens the given JSON Lines file and reads each [MessageEntry] from it.
///
/// We store the entries in JSON Lines format and return them from a stream to
/// avoid excessive use of memory.
///
/// Lines that are empty or contain only whitespace are skipped, so that a
/// stray blank line in a corpus file does not abort the whole read. Any other
/// line that is not a JSON object in the [MessageEntry.fromJson] format makes
/// the stream emit an error.
Stream<MessageEntry> readMessagesFromJsonl(File file) => file.openRead()
  .transform(utf8.decoder).transform(const LineSplitter())
  .where((line) => line.trim().isNotEmpty)
  .map(jsonDecode).map((x) => MessageEntry.fromJson(x as Map<String, Object?>));
Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
@Timeout(Duration(minutes: 10))
2+
library;
3+
4+
import 'dart:io';
5+
import 'dart:math';
6+
7+
import 'package:checks/checks.dart';
8+
import 'package:html/dom.dart' as dom;
9+
import 'package:flutter/foundation.dart';
10+
import 'package:flutter_test/flutter_test.dart';
11+
import 'package:zulip/model/content.dart';
12+
13+
import 'model.dart';
14+
15+
16+
/// Checks the given corpuses of HTML contents from Zulip messages for
/// features that the content parser has not implemented.
///
/// This test is meant to be manually run.
///
/// To run it, use:
///
///     flutter test tools/content --dart-define=corpusDir=path/to/corpusDir
///
/// where `path/to/corpusDir` should be a directory containing files with
/// outputs generated from tools/content/fetch_messages.dart.
///
/// Optionally, you can enable more details with `--dart-define=verbose=true`.
///
/// One test is registered per corpus file. A test fails when its file contains
/// any unimplemented feature, and the failure reason starts with an overview
/// of those features, followed (in verbose mode) by the details of each one.
/// To look for live examples, you can search on the Zulip community by message
/// ID from all public channels.
///
/// For example, a search query like "near: 12345 channels: public" would work.
///
/// See also:
///  * lib/model/content.dart, the implementation of the content parser.
///  * tools/content/fetch_messages.dart, the script that produces the corpuses.
void main() async {
  Future<void> checkForUnimplementedFeatureInFile(File file) async {
    final idsByFeature = <String, Set<int>>{};
    final htmlByFeature = <String, List<String>>{};

    await for (final entry in readMessagesFromJsonl(file)) {
      final tree = parseContent(entry.html).toDiagnosticsNode();
      _walk(entry.id, tree,
        messageIdsByFeature: idsByFeature,
        contentsByFeature: htmlByFeature);
    }

    // Collect the report in memory rather than printing it; it is handed to
    // the final check as the failure reason.
    final report = <String>[
      if (idsByFeature.isNotEmpty) 'Found unimplemented features:',
      for (final MapEntry(key: feature, value: ids) in idsByFeature.entries)
        '- `$feature`\n Oldest message: ${ids.reduce(min)}; newest message: ${ids.reduce(max)}\n',
      '',
    ];

    if (_verbose) {
      final divider = '\n\n${'=' * 80}\n\n';
      var featureNumber = 0;
      for (final MapEntry(key: feature, value: examples) in htmlByFeature.entries) {
        featureNumber++;
        report
          ..add('Unsupported feature #$featureNumber: $feature')
          ..add('message IDs:\n${idsByFeature[feature]!.join(', ')}')
          ..add('first 10 examples:\n${examples.take(10).join(divider)}')
          ..add('\n');
      }
    }

    // One map entry exists per distinct unimplemented feature.
    final unsupportedCount = htmlByFeature.length;
    check(unsupportedCount, because: report.join('\n')).equals(0);
  }

  final corpusFiles = _getCorpusFiles();

  void registerTests() {
    for (final corpus in corpusFiles) {
      test(corpus.path, () => checkForUnimplementedFeatureInFile(corpus));
    }
  }

  group('Check for unimplemented features in', registerTests,
    skip: corpusFiles.isEmpty);
}
84+
85+
// Whether details about all messages with unimplemented features should be
// printed. Read from the `verbose` compile-time environment declaration.
const bool _verbose = bool.fromEnvironment('verbose');

// The directory holding the corpus files. Read from the `corpusDir`
// compile-time environment declaration.
const String _corpusDirPath = String.fromEnvironment('corpusDir');

/// Returns the files found directly inside the corpus directory, or nothing
/// when that directory does not exist.
Iterable<File> _getCorpusFiles() {
  final directory = Directory(_corpusDirPath);
  if (!directory.existsSync()) {
    return [];
  }
  return directory.listSync().whereType<File>();
}
95+
96+
/// Traverses the diagnostics tree rooted at [node] and records every
/// unimplemented node it reaches, grouped by feature category.
///
/// The children of an unimplemented node are not visited.
///
/// This modifies `messageIdsByFeature` and `contentsByFeature` in-place,
/// adding [messageId] and the node's HTML text under the category's name.
void _walk(int messageId, DiagnosticsNode node, {
  required Map<String, Set<int>> messageIdsByFeature,
  required Map<String, List<String>> contentsByFeature,
}) {
  final nodeValue = node.value;
  if (nodeValue is UnimplementedNode) {
    // The category is a prettified identifier, so that unimplemented
    // features that are likely closely related end up grouped together.
    final source = nodeValue.debugHtmlNode;
    final String category;
    if (source is! dom.Element) {
      category = 'DOM node type: ${source.nodeType}';
    } else if (source.className.isNotEmpty) {
      category = '<${source.localName!} class="${source.classes.join(" ")}">';
    } else {
      category = '<${source.localName!}>';
    }

    messageIdsByFeature.putIfAbsent(category, () => {}).add(messageId);
    contentsByFeature.putIfAbsent(category, () => []).add(nodeValue.debugHtmlText);
    return;
  }

  for (final child in node.getChildren()) {
    _walk(messageId, child,
      messageIdsByFeature: messageIdsByFeature,
      contentsByFeature: contentsByFeature);
  }
}

0 commit comments

Comments
 (0)