Skip to content

Commit 5728c5f

Browse files
committed
wip; Support systematically surveying unimplemented content features.
Signed-off-by: Zixuan James Li <[email protected]>
1 parent 7b24f6d commit 5728c5f

File tree

4 files changed

+382
-0
lines changed

4 files changed

+382
-0
lines changed

message_fixtures/chat.zulip.org.jsonl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
{"id": 1, "html": "<p>asd</p>"}
2+
{"id": 1, "html": "<p><span class=\"topic-mention\">@topic</span>>!</p>"}

tools/content/fetch_messages.dart

Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
#!/usr/bin/env dart
2+
3+
import 'dart:convert';
4+
import 'dart:io';
5+
import 'dart:math';
6+
7+
// Avoid any Flutter-related dependencies so this can be run in the CLI.
8+
import 'package:args/args.dart';
9+
import 'package:http/http.dart';
10+
import 'package:zulip/api/backoff.dart';
11+
12+
import 'model.dart';
13+
14+
/// Fetch message contents from the specified server in bulk.
15+
///
16+
/// It outputs the message IDs and the rendered HTML contents in
17+
/// .jsonl (https://jsonlines.org) format, which can be used later
18+
/// to perform checks for discovering unimplemented features.
19+
///
20+
/// See tools/content/unimplemented_features_test.dart for more details.
21+
void main(List<String> args) async {
  // Command-line interface. The option names, help texts and defaults below
  // are what `--help` prints.
  final argParser = ArgParser();
  argParser.addOption(
    'email',
    help: 'The email. See https://zulip.com/api/api-keys for help.',
    mandatory: true,
  );
  argParser.addOption(
    'api-key',
    help: 'The API key. See https://zulip.com/api/api-keys for help.',
    mandatory: true,
  );
  argParser.addOption(
    'site',
    help: 'The URL of the Zulip server to fetch messages from.',
    valueHelp: 'https://example.zulip.com',
    mandatory: true,
  );
  argParser.addOption(
    'file',
    help: 'The file to output the messages to. If not given, write output to '
      'stdout. Otherwise, if the file exists, its format should match the '
      'output of the program. This will first read from the file to avoid '
      'duplicates, by fetching messages starting from the newest/oldest '
      'known message, then append the output to the end of the file.',
    valueHelp: 'path/to/czo.jsonl',
  );
  argParser.addOption(
    'count',
    defaultsTo: '100',
    help: 'The total number of messages to fetch.',
  );
  argParser.addFlag(
    'fetch-newer',
    help: 'Fetch newer messages instead of older ones. '
      'Only useful when --file is supplied.',
    defaultsTo: false,
  );
  argParser.addFlag(
    'help', abbr: 'h',
    negatable: false,
    help: 'Show this help message.',
  );

  void printUsage() {
    // ignore: avoid_print
    print('Fetch Zulip message contents from a given server.\n'
      'Usage: fetch_messages --email <EMAIL> --api-key <API_KEY> --site <SERVER_URL>\n'
      '${argParser.usage}');
  }

  // Prints the usage text, then aborts with the given error message.
  Never throwWithUsage(String error) {
    printUsage();
    throw Exception('\nError: $error');
  }

  final parsedArguments = argParser.parse(args);
  if (parsedArguments['help'] as bool) {
    printUsage();
    exit(0);
  }

  final email = parsedArguments['email'] as String?;
  if (email == null) throwWithUsage('Option email is required');

  final apiKey = parsedArguments['api-key'] as String?;
  if (apiKey == null) throwWithUsage('Option api-key is required');

  final realmUrlStr = parsedArguments['site'] as String?;
  if (realmUrlStr == null) throwWithUsage('Option site is required');
  final realmUrl = Uri.parse(realmUrlStr);

  // Report a malformed --count the same way as the other options, instead of
  // letting a bare FormatException escape without the usage text.
  final countStr = parsedArguments['count'] as String;
  final count = int.tryParse(countStr);
  if (count == null) {
    throwWithUsage('Option count must be an integer, got: $countStr');
  }

  final outputPath = parsedArguments['file'] as String?;
  final fetchNewer = parsedArguments['fetch-newer'] as bool;
  int? anchorMessageId;
  IOSink output = stdout;
  if (outputPath != null) {
    final outputFile = File(outputPath);
    if (!outputFile.existsSync()) {
      outputFile.createSync();
    }
    await for (final message in readMessagesFromJsonl(outputFile)) {
      // Find the newest/oldest message ID as the anchor.
      anchorMessageId ??= message.id;
      anchorMessageId = (fetchNewer ? max : min)(message.id, anchorMessageId);
    }
    output = outputFile.openWrite(mode: FileMode.writeOnlyAppend);
  }

  final client = Client();
  final authHeader = 'Basic ${base64Encode(utf8.encode('$email:$apiKey'))}';

  // These are working constants chosen arbitrarily.
  const batchSize = 5000;
  const maxRetries = 10;
  const fetchInterval = Duration(seconds: 5);

  // Number of consecutive failed requests; reset after each successful batch.
  int retries = 0;
  int messageToFetch = count;
  BackoffMachine? backoff;

  while (messageToFetch > 0) {
    // Fetch messages in batches, from newer messages to older messages by
    // default, until there aren't any more messages to be fetched. Note that
    // the message IDs of Zulip messages are higher for newer messages.
    final currentBatchSize = min(batchSize, messageToFetch);
    final _GetMessagesResult result;
    try {
      result = await _getMessages(client, realmUrl: realmUrl,
        authHeader: authHeader,
        anchorMessageId: anchorMessageId,
        numBefore: (!fetchNewer) ? currentBatchSize : 0,
        numAfter: (fetchNewer) ? currentBatchSize : 0,
      );
    } catch (e) {
      // We could have more fine-grained error handling and avoid retrying on
      // non-network-related failures, but that's skipped for now.
      if (retries >= maxRetries) {
        rethrow;
      }
      retries++;
      await (backoff ??= BackoffMachine()).wait();
      continue;
    }

    // Materialize once: the entries are inspected several times below, and a
    // lazy `map` would re-parse every raw message on each pass.
    final messageEntries =
      result.messages.map(MessageEntry.fromRawMessage).toList();
    if (messageEntries.isEmpty) {
      if (fetchNewer) assert(result.foundNewest);
      if (!fetchNewer) assert(result.foundOldest);
      break;
    }

    // Find the newest/oldest message as the next message fetch anchor.
    anchorMessageId = messageEntries.map((x) => x.id).reduce(fetchNewer ? max : min);
    for (final entry in messageEntries) {
      output.writeln(jsonEncode(entry));
    }
    messageToFetch -= messageEntries.length;

    // This I/O operation could fail, but crashing is fine here.
    final flushFuture = output.flush();
    // Make sure the delay happens concurrently to the flush.
    if (messageToFetch > 0) await Future<void>.delayed(fetchInterval);
    await flushFuture;

    // A batch went through, so start over with a fresh backoff and a fresh
    // retry budget for the next one.
    backoff = null;
    retries = 0;
  }
  exit(0);
}
169+
170+
// Ported from [GetMessagesResult] to avoid depending on Flutter libraries.
171+
class _GetMessagesResult {
  const _GetMessagesResult(this.foundOldest, this.foundNewest, this.messages);

  /// Builds a result from a decoded JSON object.
  ///
  /// Reads the `found_oldest`, `found_newest` and `messages` keys; each
  /// element of `messages` must itself be a JSON object.
  factory _GetMessagesResult.fromJson(Map<String, Object?> json) {
    final foundOldest = json['found_oldest'] as bool;
    final foundNewest = json['found_newest'] as bool;
    final rawMessages = json['messages'] as List<Object?>;
    return _GetMessagesResult(foundOldest, foundNewest, [
      for (final raw in rawMessages) raw as Map<String, Object?>,
    ]);
  }

  /// The value of `found_oldest` in the response.
  final bool foundOldest;

  /// The value of `found_newest` in the response.
  final bool foundNewest;

  /// The raw, still-undecoded message objects from the response.
  final List<Map<String, Object?>> messages;
}
184+
185+
/// Requests one batch of messages from `/api/v1/messages` on [realmUrl].
///
/// The request is narrowed to public channels and authenticated with
/// [authHeader]. When [anchorMessageId] is null the anchor is `newest` and
/// the anchor message is included; otherwise the given message is used as
/// the anchor and is excluded from the result.
///
/// Throws an [Exception] carrying the status code and the response details
/// when the status is not 200 or the body is not a JSON object.
Future<_GetMessagesResult> _getMessages(Client client, {
  required Uri realmUrl,
  required String authHeader,
  required int numBefore,
  required int numAfter,
  int? anchorMessageId,
}) async {
  final url = realmUrl.replace(
    path: '/api/v1/messages',
    queryParameters: {
      'anchor': anchorMessageId != null ? jsonEncode(anchorMessageId) : 'newest',
      // A known anchor message already exists in the output,
      // so avoid fetching it again.
      'include_anchor': jsonEncode(anchorMessageId == null),
      'num_before': jsonEncode(numBefore),
      'num_after': jsonEncode(numAfter),
      'narrow': jsonEncode([{'operator': 'channels', 'operand': 'public'}]),
    });
  final response = await client.send(
    Request('GET', url)..headers['Authorization'] = authHeader);
  final body = utf8.decode(await response.stream.toBytes());

  // Error responses are not guaranteed to be JSON. Decode leniently so that
  // the status code is still reported below, rather than being hidden behind
  // a FormatException from the decoder.
  Object? json;
  try {
    json = jsonDecode(body);
  } on FormatException {
    json = null;
  }

  if (response.statusCode != 200 || json is! Map<String, Object?>) {
    // We could handle rate limiting or other error codes, but just crashing
    // early here should be fine for this tool.
    final details = json ?? (body.isEmpty ? 'unknown' : body);
    throw Exception('Failed to get messages. Code: ${response.statusCode}\nDetails: $details');
  }
  return _GetMessagesResult.fromJson(json);
}

tools/content/model.dart

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
import 'dart:io';
2+
import 'dart:convert';
3+
4+
import 'package:json_annotation/json_annotation.dart';
5+
6+
/// A data structure representing a message.
7+
@JsonSerializable()
final class MessageEntry {
  const MessageEntry({required this.id, required this.html});

  /// Selectively parses from get-message responses.
  ///
  /// Only the `id` and `content` keys of the raw message are read.
  ///
  /// See also: https://zulip.com/api/get-messages#response
  factory MessageEntry.fromRawMessage(Map<String, Object?> json) {
    final id = (json['id'] as num).toInt();
    final html = json['content'] as String;
    return MessageEntry(id: id, html: html);
  }

  /// Parses an entry in the form produced by [toJson].
  factory MessageEntry.fromJson(Map<String, Object?> json) {
    final id = (json['id'] as num).toInt();
    final html = json['html'] as String;
    return MessageEntry(id: id, html: html);
  }

  /// The message ID, unique within a server.
  final int id;

  /// The rendered HTML of the message.
  final String html;

  /// Encodes this entry as a JSON object with the keys `id` and `html`.
  Map<String, Object> toJson() {
    return <String, Object>{'id': id, 'html': html};
  }
}
31+
32+
/// Open the given JSON Lines file and read [MessageEntry] from it.
///
/// We store the entries in JSON Lines format and return them from a stream to
/// avoid excessive use of memory.
///
/// Lines that are empty or contain only whitespace are skipped, so a stray
/// blank line does not abort the read. Any other line must be a JSON object
/// in the form accepted by [MessageEntry.fromJson].
Stream<MessageEntry> readMessagesFromJsonl(File file) => file.openRead()
  .transform(utf8.decoder).transform(const LineSplitter())
  .where((line) => line.trim().isNotEmpty)
  .map(jsonDecode).map((x) => MessageEntry.fromJson(x as Map<String, Object?>));
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
@Timeout(Duration(minutes: 10))
2+
library;
3+
4+
import 'dart:io';
5+
import 'dart:math';
6+
7+
import 'package:checks/checks.dart';
8+
import 'package:html/dom.dart' as dom;
9+
import 'package:flutter/foundation.dart';
10+
import 'package:flutter_test/flutter_test.dart';
11+
import 'package:zulip/model/content.dart';
12+
13+
import 'model.dart';
14+
15+
16+
/// Check if there are unimplemented features from the given corpuses of HTML
17+
/// contents from Zulip messages.
18+
///
19+
/// This test is meant to be run manually.
20+
///
21+
/// To run it, use:
22+
///
23+
/// flutter test tools/content --dart-define=corpusDir=path/to/corpusDir
24+
///
25+
/// where `path/to/corpusDir` should be a directory containing files with
26+
/// outputs generated from tools/content/fetch_messages.dart.
27+
///
28+
/// Optionally, you can enable more details with `--dart-define=verbose=true`.
29+
///
30+
/// The test writes an overview of unimplemented features at the beginning to
31+
/// standard output, followed by the details of each feature. To look for live
32+
/// examples, you can search on the Zulip community by message ID from all
33+
/// public channels.
34+
///
35+
/// For example, a search query like "near: 12345 channels: public" would work.
36+
///
37+
/// See also:
38+
/// * lib/model/content.dart, the implementation of the content parser.
39+
/// * tools/content/fetch_messages.dart, the script that produces the corpuses.
40+
void main() async {
  // Parses every message in [file], collects the unimplemented features
  // found, and fails the test with a report if there are any.
  Future<void> checkForUnimplementedFeatureInFile(File file) async {
    final idsByFeature = <String, Set<int>>{};
    final htmlByFeature = <String, List<String>>{};

    await for (final entry in readMessagesFromJsonl(file)) {
      final tree = parseContent(entry.html).toDiagnosticsNode();
      _walk(entry.id, tree,
        messageIdsByFeature: idsByFeature,
        contentsByFeature: htmlByFeature);
    }

    // Collect the report in a list rather than printing; it is only shown as
    // the failure reason of the check at the end.
    final report = <String>[];
    if (idsByFeature.isNotEmpty) {
      report.add('Found unimplemented features:');
    }
    for (final summary in idsByFeature.entries) {
      final oldest = summary.value.reduce(min);
      final newest = summary.value.reduce(max);
      report.add('- `${summary.key}`\n Oldest message: $oldest; newest message: $newest\n');
    }
    report.add('');

    final separator = '\n\n${'=' * 80}\n\n';
    var featureCount = 0;
    for (final detail in htmlByFeature.entries) {
      featureCount += 1;
      // The per-feature details are only included in verbose mode; the
      // features are counted either way.
      if (_verbose) {
        report
          ..add('Unsupported feature #$featureCount: ${detail.key}')
          ..add('message IDs:\n${idsByFeature[detail.key]!.join(', ')}')
          ..add('first 10 examples:\n${detail.value.take(10).join(separator)}')
          ..add('\n');
      }
    }
    check(featureCount, because: report.join('\n')).equals(0);
  }

  // One test per corpus file; the whole group is skipped when there are none.
  final corpusFiles = _getCorpusFiles();
  group('Check for unimplemented features in', () {
    for (final corpusFile in corpusFiles) {
      test(corpusFile.path, () => checkForUnimplementedFeatureInFile(corpusFile));
    }
  }, skip: corpusFiles.isEmpty);
}
84+
85+
// Determine whether details about all messages with unimplemented features
86+
// should be printed.
87+
// Read from the `verbose` compile-time environment declaration, e.g.
// `--dart-define=verbose=true`.
const bool _verbose = bool.fromEnvironment('verbose');
88+
89+
// Path of the directory that is scanned for corpus files, read from the
// `corpusDir` compile-time environment declaration, e.g.
// `--dart-define=corpusDir=path/to/corpusDir`.
const String _corpusDirPath = String.fromEnvironment('corpusDir');
90+
91+
/// Lists the files directly inside the directory at [_corpusDirPath].
///
/// Returns nothing if that directory does not exist. Entries that are not
/// files are left out.
Iterable<File> _getCorpusFiles() {
  final directory = Directory(_corpusDirPath);
  if (!directory.existsSync()) {
    return [];
  }
  return directory.listSync().whereType<File>();
}
95+
96+
/// Walk the tree looking for unimplemented nodes, and aggregate them by the
97+
/// category of the unimplemented feature.
98+
///
99+
/// This modifies `messageIdsByFeature` and `contentsByFeature` in-place.
100+
void _walk(int messageId, DiagnosticsNode node, {
  required Map<String, Set<int>> messageIdsByFeature,
  required Map<String, List<String>> contentsByFeature,
}) {
  final value = node.value;
  if (value is UnimplementedNode) {
    // Name the feature after the HTML node that the parser gave up on: the
    // tag name plus its classes for an element, the DOM node type otherwise.
    final htmlNode = value.debugHtmlNode;
    final String featureName;
    if (htmlNode is! dom.Element) {
      featureName = 'DOM node type: ${htmlNode.nodeType}';
    } else if (htmlNode.className.isEmpty) {
      featureName = '<${htmlNode.localName!}>';
    } else {
      featureName = '<${htmlNode.localName!} class="${htmlNode.classes.join(" ")}">';
    }

    messageIdsByFeature.putIfAbsent(featureName, () => <int>{}).add(messageId);
    contentsByFeature.putIfAbsent(featureName, () => <String>[]).add(value.debugHtmlText);
    // An unimplemented node is recorded as-is; its children are not visited.
    return;
  }

  // Any other node: keep looking further down the tree.
  for (final child in node.getChildren()) {
    _walk(messageId, child,
      messageIdsByFeature: messageIdsByFeature,
      contentsByFeature: contentsByFeature);
  }
}

0 commit comments

Comments
 (0)