@@ -20,7 +20,7 @@ use std::collections::{HashMap, HashSet};
 
 use once_cell::sync::Lazy;
 use regex::Regex;
-use serde::Deserialize;
+use serde::{Deserialize, Deserializer};
 use serde_json::{Map, Value};
 use tracing::{error, warn};
 
@@ -35,13 +35,35 @@ pub static KNOWN_SCHEMA_LIST: Lazy<EventProcessor> =
 #[error("Event is not in the expected text/JSON format for {0}")]
 pub struct Unacceptable(String);
 
+/// Deserializes a string pattern into a compiled Regex
+/// NOTE: we only warn if the pattern doesn't compile
+pub fn deserialize_regex<'de, D>(deserializer: D) -> Result<Option<Regex>, D::Error>
+where
+    D: Deserializer<'de>,
+{
+    let pattern = String::deserialize(deserializer)?;
+
+    let regex = Regex::new(&pattern)
+        .inspect_err(|err| error!("Error compiling regex pattern: {err}; Pattern: {pattern}"))
+        .ok();
+
+    Ok(regex)
+}
+
+/// Configuration for a single pattern within a log format
+#[derive(Debug, Default, Deserialize)]
+struct Pattern {
+    /// Regular expression pattern used to match and capture fields from log strings
+    #[serde(deserialize_with = "deserialize_regex")]
+    pattern: Option<Regex>,
+    // Maps field names to regex capture groups
+    fields: HashSet<String>,
+}
+
 /// Defines a schema for extracting structured data from logs using regular expressions
 #[derive(Debug, Default)]
 pub struct SchemaDefinition {
-    /// Regular expression pattern used to match and capture fields from log strings
-    patterns: Vec<Regex>,
-    // Maps field names to regex capture groups
-    field_mappings: Vec<HashSet<String>>,
+    patterns: Vec<Pattern>,
 }
 
 impl SchemaDefinition {
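Aside (not part of the diff): a minimal sketch of how the `deserialize_with` hook above behaves, assuming serde_json as the wire format (the crate already depends on it). A valid pattern compiles into `Some(Regex)`, while an invalid one is tolerated and becomes `None` instead of failing deserialization. `Demo` is a hypothetical stand-in for `Pattern`, and the helper below omits the `error!` logging for brevity.

    use regex::Regex;
    use serde::{Deserialize, Deserializer};

    fn deserialize_regex<'de, D>(deserializer: D) -> Result<Option<Regex>, D::Error>
    where
        D: Deserializer<'de>,
    {
        let pattern = String::deserialize(deserializer)?;
        // Invalid patterns are swallowed (the real helper also logs them via `error!`).
        Ok(Regex::new(&pattern).ok())
    }

    // Hypothetical stand-in for the `Pattern` struct above.
    #[derive(Debug, Deserialize)]
    struct Demo {
        #[serde(deserialize_with = "deserialize_regex")]
        pattern: Option<Regex>,
    }

    fn main() {
        let ok: Demo = serde_json::from_str(r#"{ "pattern": "^(?P<ip>\\S+)" }"#).unwrap();
        assert!(ok.pattern.is_some());

        let bad: Demo = serde_json::from_str(r#"{ "pattern": "(" }"#).unwrap();
        assert!(bad.pattern.is_none()); // a bad regex does not abort deserialization
    }

The net effect is that a typo in one format's regex degrades that pattern to `None`, which the matching code simply skips, rather than breaking the whole format list at load time.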
@@ -55,36 +77,40 @@ impl SchemaDefinition {
     /// * `extract_log` - Optional field name containing the raw log text
     ///
     /// # Returns
-    /// * `true` - If all expected fields are already present in the object OR if extraction was successful
-    /// * `false` - If extraction failed or no pattern was available and fields were missing
+    /// * `Some` - If all expected fields are already present in the object OR if extraction was successful.
+    ///   Contains the fields present in the capture group
+    /// * `None` - If extraction failed or no pattern was available and fields were missing
     pub fn check_or_extract(
         &self,
         obj: &mut Map<String, Value>,
         extract_log: Option<&str>,
-    ) -> bool {
-        if self
-            .field_mappings
+    ) -> Option<HashSet<String>> {
+        if let Some(pattern) = self
+            .patterns
             .iter()
-            .any(|fields| fields.iter().all(|field| obj.contains_key(field)))
+            .find(|pattern| pattern.fields.iter().all(|field| obj.contains_key(field)))
         {
-            return true;
+            return Some(pattern.fields.clone());
         }
 
         let Some(event) = extract_log
             .and_then(|field| obj.get(field))
             .and_then(|s| s.as_str())
         else {
-            return false;
+            return None;
         };
 
-        for pattern in self.patterns.iter() {
+        for format in self.patterns.iter() {
+            let Some(pattern) = format.pattern.as_ref() else {
+                continue;
+            };
             let Some(captures) = pattern.captures(event) else {
                 continue;
             };
             let mut extracted_fields = Map::new();
 
             // With named capture groups, you can iterate over the field names
-            for field_name in self.field_mappings.iter().flatten() {
+            for field_name in format.fields.iter() {
                 if let Some(value) = captures.name(field_name) {
                     extracted_fields.insert(
                         field_name.to_owned(),
@@ -95,10 +121,10 @@ impl SchemaDefinition {
 
             obj.extend(extracted_fields);
 
-            return true;
+            return Some(format.fields.clone());
         }
 
-        false
+        None
     }
 }
 
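Aside (not part of the diff): a sketch in the style of the existing tests, assuming it sits in the same tests module so the private `patterns` field and `SchemaDefinition`/`Pattern` are in scope. It shows the new contract: `Some(fields)` names exactly the keys now guaranteed to be on the object, `None` means no pattern matched. The regex and log line are made up for illustration.

    #[test]
    fn check_or_extract_reports_known_fields() {
        use std::collections::HashSet;

        use regex::Regex;
        use serde_json::{Map, Value};

        let schema = SchemaDefinition {
            patterns: vec![Pattern {
                pattern: Regex::new(r"^(?P<ip>\S+) (?P<method>\S+)").ok(),
                fields: HashSet::from_iter(["ip".to_string(), "method".to_string()]),
            }],
        };

        let mut obj = Map::new();
        obj.insert("log".to_string(), Value::String("10.0.0.1 GET".to_string()));

        // Extraction succeeds, and the returned set tells the caller which keys were added.
        let fields = schema
            .check_or_extract(&mut obj, Some("log"))
            .expect("pattern should match the log line");
        assert!(fields.contains("ip") && fields.contains("method"));
        assert_eq!(obj.get("method").unwrap().as_str().unwrap(), "GET");
    }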
@@ -109,13 +135,6 @@ struct Format {
     regex: Vec<Pattern>,
 }
 
-/// Configuration for a single pattern within a log format
-#[derive(Debug, Deserialize)]
-struct Pattern {
-    pattern: Option<String>,
-    fields: HashSet<String>,
-}
-
 /// Manages a collection of schema definitions for various log formats
 #[derive(Debug)]
 pub struct EventProcessor {
@@ -140,18 +159,7 @@ impl EventProcessor {
                 .entry(format.name.clone())
                 .or_insert_with(SchemaDefinition::default);
 
-                schema.field_mappings.push(regex.fields.clone());
-                // Compile the regex pattern if present
-                // NOTE: we only warn if the pattern doesn't compile
-                if let Some(pattern) = regex.pattern.and_then(|pattern| {
-                    Regex::new(&pattern)
-                        .inspect_err(|err| {
-                            error!("Error compiling regex pattern: {err}; Pattern: {pattern}")
-                        })
-                        .ok()
-                }) {
-                    schema.patterns.push(pattern);
-                }
+                schema.patterns.push(regex);
             }
         }
 
@@ -173,32 +181,37 @@ impl EventProcessor {
         json: &mut Value,
         log_source: &str,
         extract_log: Option<&str>,
-    ) -> Result<(), Unacceptable> {
+    ) -> Result<HashSet<String>, Unacceptable> {
         let Some(schema) = self.schema_definitions.get(log_source) else {
             warn!("Unknown log format: {log_source}");
-            return Ok(());
+            return Ok(Default::default());
         };
 
+        let mut fields = HashSet::new();
         match json {
             Value::Array(list) => {
                 for event in list {
                     let Value::Object(event) = event else {
                         continue;
                     };
-                    if !schema.check_or_extract(event, extract_log) {
+                    if let Some(known_fields) = schema.check_or_extract(event, extract_log) {
+                        fields.extend(known_fields);
+                    } else {
                         return Err(Unacceptable(log_source.to_owned()));
                     }
                 }
             }
             Value::Object(event) => {
-                if !schema.check_or_extract(event, extract_log) {
+                if let Some(known_fields) = schema.check_or_extract(event, extract_log) {
+                    return Ok(known_fields);
+                } else {
                     return Err(Unacceptable(log_source.to_owned()));
                 }
             }
             _ => unreachable!("We don't accept events of the form: {json}"),
         }
 
-        Ok(())
+        Ok(fields)
     }
 }
 
@@ -244,7 +257,7 @@ mod tests {
 
         // Use check_or_extract instead of extract
         let result = schema.check_or_extract(&mut obj, Some(log_field));
-        assert!(result, "Failed to extract fields from valid log");
+        assert!(result.is_some(), "Failed to extract fields from valid log");
 
         // Verify extracted fields were added to the object
         assert_eq!(obj.get("ip").unwrap().as_str().unwrap(), "192.168.1.1");
@@ -275,7 +288,7 @@ mod tests {
 
         // Use check_or_extract instead of extract
         let result = schema.check_or_extract(&mut obj, Some(log_field));
-        assert!(result, "Failed to extract fields from valid log");
+        assert!(result.is_some(), "Failed to extract fields from valid log");
 
         // Verify extracted fields were added to the object
         assert_eq!(obj.get("level").unwrap().as_str().unwrap(), "ERROR");
@@ -308,7 +321,10 @@ mod tests {
 
         // check_or_extract should return true without modifying anything
         let result = schema.check_or_extract(&mut obj, None);
-        assert!(result, "Should return true when fields already exist");
+        assert!(
+            result.is_some(),
+            "Should return true when fields already exist"
+        );
 
         // Verify the original values weren't changed
         assert_eq!(
@@ -332,7 +348,10 @@ mod tests {
 
         // check_or_extract should return false
         let result = schema.check_or_extract(&mut obj, Some(log_field));
-        assert!(!result, "Should not extract fields from invalid log format");
+        assert!(
+            result.is_none(),
+            "Should not extract fields from invalid log format"
+        );
 
         // Verify no fields were added
         assert!(!obj.contains_key("ip"));
@@ -343,11 +362,10 @@ mod tests {
     fn test_no_pattern_missing_fields() {
         // Create a schema definition with no pattern
         let schema = SchemaDefinition {
-            patterns: vec![],
-            field_mappings: vec![HashSet::from_iter([
-                "field1".to_string(),
-                "field2".to_string(),
-            ])],
+            patterns: vec![Pattern {
+                pattern: None,
+                fields: HashSet::from_iter(["field1".to_string(), "field2".to_string()]),
+            }],
         };
 
         // Create an object missing the required fields
@@ -360,7 +378,7 @@ mod tests {
         // check_or_extract should return false
         let result = schema.check_or_extract(&mut obj, Some("log"));
         assert!(
-            !result,
+            result.is_none(),
             "Should return false when no pattern and missing fields"
         );
     }
@@ -467,7 +485,10 @@ mod tests {
 
         // check_or_extract should return false
         let result = schema.check_or_extract(&mut obj, Some("raw_log"));
-        assert!(!result, "Should return false when log field is missing");
+        assert!(
+            result.is_none(),
+            "Should return false when log field is missing"
+        );
 
         // Verify no fields were added
         assert!(!obj.contains_key("level"));