@@ -31,37 +31,141 @@ module CharEncoding
31
31
32
32
Utf8ReplacementChar = [ 0xEF , 0xBF , 0xBD ] . pack ( "c*" )
33
33
34
+ DEFAULT_INVALID_CHAR = "\ufffd "
34
35
35
- def self . utf8_string ( str )
36
# @api private
#
# Finds every position in str at which char occurs.
#
# @param str [String] the string to scan
# @param char [String] the single character to look for
# @return [Array<Integer>] character indexes of each match, in increasing order
def self.all_indexes_of_char(str, char)
  # Walk the characters once instead of calling str[i] for every index:
  # indexing into a multibyte string is a linear scan, which made the
  # original lookup quadratic, and it also probed one index past the end.
  str.each_char.each_with_index.select { |c, _| c == char }.map { |_, i| i }
end
40
+
41
# @api private
#
# Builds a new array holding every element of the input except the
# final one. The input array itself is not modified.
#
# @param array [Array] the array to copy from
# @return [Array] a copy of array without its last element
def self.drop_last(array)
  array.slice(0...-1)
end
49
+
50
# @api private
#
# Takes an array of increasing integers and collapses each run of
# consecutive integers into a single inclusive range. An integer with
# no neighbours becomes a one-element range (n..n).
#
# @param index_array [Array<Integer>] an array of sorted integers
# @return [Array<Range>] the ranges, in the same order as the input
def self.collapse_ranges(index_array)
  index_array.inject([]) do |spans, n|
    if spans.empty? || spans.last.end != n - 1
      # n does not continue the previous run, so it starts a new one
      spans << Range.new(n, n)
    else
      # Ranges are immutable, so swap the last span for one extended to n.
      # Replacing it in place avoids copying the whole accumulator each time.
      spans[-1] = Range.new(spans.last.begin, n)
      spans
    end
  end
end
66
+
67
# @api private
#
# Scans the string str with bad characters found at bad_char_indexes
# and returns an array of messages that give some context around the
# bad characters. Each message shows up to 100 characters before the
# run of bad characters and up to 100 after it. Fewer are shown when
# the run is near the start or end of the string, or when another run
# of bad characters appears within that window.
#
# @param str [String] string coming from to_pson, likely a command to be submitted to PDB
# @param bad_char_indexes [Array<Integer>] sorted indexes into str where invalid characters were found
# @return [Array<String>] one message per run of consecutive bad characters
def self.error_char_context(str, bad_char_indexes)
  bad_char_ranges = collapse_ranges(bad_char_indexes)
  bad_char_ranges.each_with_index.map do |r, index|
    gap = r.end - r.begin + 1

    # The context window must not reach into a neighbouring bad run
    prev_bad_char_end = bad_char_ranges[index - 1].end + 1 if index > 0
    next_bad_char_begin = bad_char_ranges[index + 1].begin - 1 if index < bad_char_ranges.length - 1

    start_char = [prev_bad_char_end || 0, r.begin - 100].max
    end_char = [next_bad_char_begin || str.length - 1, r.end + 100].min

    # Exclusive upper bound: with an inclusive `..r.begin - 1` a run that
    # starts at index 0 produced `str[0..-1]`, i.e. the entire string.
    prefix = str[start_char...r.begin]
    suffix = str[r.end + 1..end_char]

    "'#{prefix} ' followed by #{gap} invalid/undefined bytes then '#{suffix} '"
  end
end
96
+
97
# @api private
#
# Emits a Puppet warning when str contains the replacement character
# that marks an invalid byte sequence. When the log level is debug, a
# second message is logged describing where in the string each bad
# run was found. The string is always returned as given.
#
# @param str [String] a string coming from to_pson, likely a command to be submitted to PDB
# @param error_context_str [String] information about where this string came from for use in error messages
# @return [String] str, unmodified
def self.warn_if_invalid_chars(str, error_context_str)
  bad_char_indexes = all_indexes_of_char(str, DEFAULT_INVALID_CHAR)
  return str if bad_char_indexes.empty?

  Puppet.warning "#{error_context_str} ignoring invalid UTF-8 byte sequences in data to be sent to PuppetDB, see debug logging for more info"

  # Only build the per-run context when it will actually be logged
  if Puppet.settings[:log_level] == "debug"
    Puppet.debug error_context_str + "\n " + error_char_context(str, bad_char_indexes).join("\n ")
  end

  str
end
119
+
120
# @api private
#
# Attempts to coerce str to UTF-8. When the bytes are not valid UTF-8,
# the offending bytes are replaced and a warning is reported using
# error_context_str. The argument is never modified; a copy is
# converted and returned.
#
# @param str [String] a string coming from to_pson, likely a command to be submitted to PDB
# @param error_context_str [String] information about where this string came from for use in error messages
# @return [String] a UTF-8 encoded copy of str
def self.coerce_to_utf8(str, error_context_str)
  # The incoming string was created by to_pson, which calls
  # force_encoding('ASCII-8BIT') on it before returning. That leaves
  # the actual UTF-8 bytes alone, so in the common case relabelling
  # the copy as UTF-8 is all that is needed.
  candidate = str.dup.force_encoding("UTF-8")
  return candidate.encode!("UTF-8") if candidate.valid_encoding?

  # Otherwise there is probably binary data mixed into the UTF-8
  # string, so warn and give the user more information.
  #
  # The copy is relabelled as US-ASCII to avoid any overlapping-byte
  # issues that could arise from mis-interpreting a random extra byte
  # as part of a multi-byte UTF-8 character.
  candidate.force_encoding("US-ASCII")
  replaced = candidate.encode!("UTF-8",
                               :invalid => :replace,
                               :undef   => :replace,
                               :replace => DEFAULT_INVALID_CHAR)
  warn_if_invalid_chars(replaced, error_context_str)
end
154
+
155
+ def self . utf8_string ( str , error_context_str )
36
156
if RUBY_VERSION =~ /^1.8/
37
157
# Ruby 1.8 doesn't have String#encode and related methods, and there
38
158
# appears to be a bug in iconv that will interpret some byte sequences
39
159
# as 6-byte characters. Thus, we are forced to resort to some unfortunate
40
160
# manual chicanery.
41
161
warn_if_changed ( str , ruby18_clean_utf8 ( str ) )
42
- elsif str . encoding == Encoding ::UTF_8
43
- # If we get here, we're in ruby 1.9+, so we have the string encoding methods
44
- # available. However, just because a ruby String object is already
45
- # marked as UTF-8, that doesn't guarantee that its contents are actually
46
- # valid; and if you call ruby's ".encode" method with an encoding of
47
- # "utf-8" for a String that ruby already believes is UTF-8, ruby
48
- # seems to optimize that to be a no-op. So, we have to do some more
49
- # complex handling...
50
-
51
- # If the string already has valid encoding then we're fine.
52
- return str if str . valid_encoding?
53
-
54
- # If not, we basically have to walk over the characters and replace
55
- # them by hand.
56
- warn_if_changed ( str , str . each_char . map { |c | c . valid_encoding? ? c : "\ufffd " } . join )
57
162
else
58
- # if we get here, we're ruby 1.9 and the current string is *not* encoded
59
- # as UTF-8. Thus we can actually rely on ruby's "encode" method.
60
163
begin
61
- str . encode ( 'UTF-8' )
164
+ coerce_to_utf8 ( str , error_context_str )
62
165
rescue Encoding ::InvalidByteSequenceError , Encoding ::UndefinedConversionError => e
63
- # If we got an exception, the string is either invalid or not
64
- # convertible to UTF-8, so drop those bytes.
166
+ # If we got an exception, the string is either invalid or not
167
+ # convertible to UTF-8, so drop those bytes.
168
+
65
169
warn_if_changed ( str , str . encode ( 'UTF-8' , :invalid => :replace , :undef => :replace ) )
66
170
end
67
171
end
0 commit comments