Skip to content

Commit cc516ba

Browse files
committed
Merge pull request #1640 from senior/ticket/master/pdb-135-better-utf8-warning
(PDB-135) Improve and reduce the number of UTF-8 warnings
2 parents 4ec11fe + d9ae1d7 commit cc516ba

File tree

4 files changed

+240
-35
lines changed

4 files changed

+240
-35
lines changed

puppet/lib/puppet/util/puppetdb/char_encoding.rb

Lines changed: 125 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -31,37 +31,141 @@ module CharEncoding
3131

3232
Utf8ReplacementChar = [ 0xEF, 0xBF, 0xBD ].pack("c*")
3333

34+
DEFAULT_INVALID_CHAR = "\ufffd"
3435

35-
def self.utf8_string(str)
36+
# @api private
37+
def self.all_indexes_of_char(str, char)
38+
(0..str.length).find_all{ |i| str[i] == char}
39+
end
40+
41+
# @api private
42+
#
43+
# Takes an array and returns a sub-array without the last element
44+
#
45+
# @return [Object]
46+
def self.drop_last(array)
47+
array[0..-2]
48+
end
49+
50+
# @api private
51+
#
52+
# Takes an array of increasing integers and collapses the sequential
53+
# integers into ranges
54+
#
55+
# @param index_array an array of sorted integers
56+
# @return [Range]
57+
def self.collapse_ranges(index_array)
58+
ranges = index_array.each.inject([]) do |spans, n|
59+
if spans.empty? || spans.last.end != n - 1
60+
spans << Range.new(n, n)
61+
else
62+
drop_last(spans) << Range.new(spans.last.begin,n)
63+
end
64+
end
65+
end
66+
67+
# @api private
68+
#
69+
# Scans the string s with bad characters found at bad_char_indexes
70+
# and returns an array of messages that give some context around the
71+
# bad characters. This will give up to 100 characters prior to the
72+
# bad character and 100 after. It will return fewer if it's at the
73+
# beginning of a string or if another bad character appears before
74+
# reaching the 100 characters
75+
#
76+
# @param str string coming from to_pson, likely a command to be submitted to PDB
77+
# @param bad_char_indexes an array of indexes into the string where invalid characters were found
78+
# @return [String]
79+
def self.error_char_context(str, bad_char_indexes)
80+
bad_char_ranges = collapse_ranges(bad_char_indexes)
81+
bad_char_ranges.each_with_index.inject([]) do |state, (r, index)|
82+
gap = r.to_a.length
83+
84+
prev_bad_char_end = bad_char_ranges[index-1].end + 1 if index > 0
85+
next_bad_char_begin = bad_char_ranges[index+1].begin - 1 if index < bad_char_ranges.length - 1
86+
87+
start_char = [prev_bad_char_end || 0, r.begin-100].max
88+
end_char = [next_bad_char_begin || str.length - 1, r.end+100].min
89+
x = [next_bad_char_begin || str.length, r.end+100, str.length]
90+
prefix = str[start_char..r.begin-1]
91+
suffix = str[r.end+1..end_char]
92+
93+
state << "'#{prefix}' followed by #{gap} invalid/undefined bytes then '#{suffix}'"
94+
end
95+
end
96+
97+
# @api private
98+
#
99+
# Warns the user if an invalid character was found. If debugging is
100+
# enabled will also log contextual information about where the bad
101+
# character(s) were found
102+
#
103+
# @param str A string coming from to_pson, likely a command to be submitted to PDB
104+
# @param error_context_str information about where this string came from for use in error messages
105+
# @return String
106+
def self.warn_if_invalid_chars(str, error_context_str)
107+
bad_char_indexes = all_indexes_of_char(str, DEFAULT_INVALID_CHAR)
108+
if bad_char_indexes.empty?
109+
str
110+
else
111+
Puppet.warning "#{error_context_str} ignoring invalid UTF-8 byte sequences in data to be sent to PuppetDB, see debug logging for more info"
112+
if Puppet.settings[:log_level] == "debug"
113+
Puppet.debug error_context_str + "\n" + error_char_context(str, bad_char_indexes).join("\n")
114+
end
115+
116+
str
117+
end
118+
end
119+
120+
# @api private
121+
#
122+
# Attempts to coerce str to UTF-8, if that fails will output context
123+
# information using error_context_str
124+
#
125+
# @param str A string coming from to_pson, likely a command to be submitted to PDB
126+
# @param error_context_str information about where this string came from for use in error messages
127+
# @return Str
128+
def self.coerce_to_utf8(str, error_context_str)
129+
str_copy = str.dup
130+
# This code is passed in a string that was created by
131+
# to_pson. to_pson calls force_encoding('ASCII-8BIT') on the
132+
# string before it returns it. This leaves the actual UTF-8 bytes
133+
# alone. Below we check to see if this is the case (this should be
134+
# most common). In this case, the bytes are still UTF-8 and we can
135+
# just encode! and we're good to go. If They are not valid UTF-8
136+
# bytes, that means there is probably some binary data mixed in
137+
# the middle of the UTF-8 string. In this case we need to output a
138+
# warning and give the user more information
139+
str_copy.force_encoding("UTF-8")
140+
if str_copy.valid_encoding?
141+
str_copy.encode!("UTF-8")
142+
else
143+
# This is force_encoded as US-ASCII to avoid any overlapping
144+
# byte related issues that could arise from mis-interpreting a
145+
# random extra byte as part of a multi-byte UTF-8 character
146+
str_copy.force_encoding("US-ASCII")
147+
warn_if_invalid_chars(str_copy.encode!("UTF-8",
148+
:invalid => :replace,
149+
:undef => :replace,
150+
:replace => DEFAULT_INVALID_CHAR),
151+
error_context_str)
152+
end
153+
end
154+
155+
def self.utf8_string(str, error_context_str)
36156
if RUBY_VERSION =~ /^1.8/
37157
# Ruby 1.8 doesn't have String#encode and related methods, and there
38158
# appears to be a bug in iconv that will interpret some byte sequences
39159
# as 6-byte characters. Thus, we are forced to resort to some unfortunate
40160
# manual chicanery.
41161
warn_if_changed(str, ruby18_clean_utf8(str))
42-
elsif str.encoding == Encoding::UTF_8
43-
# If we get here, we're in ruby 1.9+, so we have the string encoding methods
44-
# available. However, just because a ruby String object is already
45-
# marked as UTF-8, that doesn't guarantee that its contents are actually
46-
# valid; and if you call ruby's ".encode" method with an encoding of
47-
# "utf-8" for a String that ruby already believes is UTF-8, ruby
48-
# seems to optimize that to be a no-op. So, we have to do some more
49-
# complex handling...
50-
51-
# If the string already has valid encoding then we're fine.
52-
return str if str.valid_encoding?
53-
54-
# If not, we basically have to walk over the characters and replace
55-
# them by hand.
56-
warn_if_changed(str, str.each_char.map { |c| c.valid_encoding? ? c : "\ufffd"}.join)
57162
else
58-
# if we get here, we're ruby 1.9 and the current string is *not* encoded
59-
# as UTF-8. Thus we can actually rely on ruby's "encode" method.
60163
begin
61-
str.encode('UTF-8')
164+
coerce_to_utf8(str, error_context_str)
62165
rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError => e
63-
# If we got an exception, the string is either invalid or not
64-
# convertible to UTF-8, so drop those bytes.
166+
# If we got an exception, the string is either invalid or not
167+
# convertible to UTF-8, so drop those bytes.
168+
65169
warn_if_changed(str, str.encode('UTF-8', :invalid => :replace, :undef => :replace))
66170
end
67171
end

puppet/lib/puppet/util/puppetdb/command.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def initialize(command, version, certname, payload)
4242
#
4343
# This is roughly inline with how Puppet serializes for catalogs as of
4444
# Puppet 4.1.0. We need a better answer to non-utf8 data end-to-end.
45-
}.to_pson)
45+
}.to_pson, "Error encoding a '#{command}' command for host '#{certname}'")
4646
end
4747
end
4848

puppet/spec/unit/util/puppetdb/char_encoding_spec.rb

Lines changed: 79 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#!/usr/bin/env rspec
2-
# encoding: UTF-8
2+
# encoding: utf-8
33

44
require 'spec_helper'
55

@@ -79,28 +79,28 @@ def test_utf8_clean(in_bytes, expected_bytes)
7979
Puppet.expects(:warning).never
8080

8181
str = "any ascii string"
82-
subject.utf8_string(str).should == str
82+
subject.utf8_string(str, nil).should == str
8383
end
8484

8585
it "should strip invalid chars from non-overlapping latin-1 with a warning" do
8686
Puppet.expects(:warning).with {|msg| msg =~ /Ignoring invalid UTF-8 byte sequences/}
8787

8888
str = "a latin-1 string \xd6"
89-
subject.utf8_string(str).should == "a latin-1 string "
89+
subject.utf8_string(str, nil).should == "a latin-1 string "
9090
end
9191

9292
it "should strip invalid chars and warn if the string is invalid UTF-8" do
9393
Puppet.expects(:warning).with {|msg| msg =~ /Ignoring invalid UTF-8 byte sequences/}
9494

9595
str = "an invalid utf-8 string \xff"
96-
subject.utf8_string(str).should == "an invalid utf-8 string "
96+
subject.utf8_string(str, nil).should == "an invalid utf-8 string "
9797
end
9898

9999
it "should return a valid utf-8 string without warning" do
100100
Puppet.expects(:warning).never
101101

102102
str = "a valid utf-8 string \xc3\x96"
103-
subject.utf8_string(str).should == str
103+
subject.utf8_string(str, nil).should == str
104104
end
105105
end
106106

@@ -109,40 +109,106 @@ def test_utf8_clean(in_bytes, expected_bytes)
109109
Puppet.expects(:warning).never
110110

111111
str = "any ascii string".force_encoding('us-ascii')
112-
subject.utf8_string(str).should == str
112+
subject.utf8_string(str, nil).should == str
113113
end
114114

115115
it "should convert from latin-1 without a warning" do
116116
Puppet.expects(:warning).never
117117

118-
str = "a latin-1 string \xd6".force_encoding('iso-8859-1')
119-
subject.utf8_string(str).should == "a latin-1 string Ö"
118+
str = "a latin-1 string Ö".force_encoding('ASCII-8BIT')
119+
subject.utf8_string(str, nil).should == "a latin-1 string Ö"
120120
end
121121

122122
# UndefinedConversionError
123123
it "should replace undefined characters and warn when converting from binary" do
124-
Puppet.expects(:warning).with {|msg| msg =~ /Ignoring invalid UTF-8 byte sequences/}
124+
Puppet.expects(:warning).with {|msg| msg =~ /Error with command ignoring invalid UTF-8 byte sequences/}
125125

126126
str = "an invalid binary string \xff".force_encoding('binary')
127127
# \ufffd == unicode replacement character
128-
subject.utf8_string(str).should == "an invalid binary string \ufffd"
128+
subject.utf8_string(str, "Error with command").should == "an invalid binary string \ufffd"
129129
end
130130

131131
# InvalidByteSequenceError
132132
it "should replace undefined characters and warn if the string is invalid UTF-8" do
133-
Puppet.expects(:warning).with {|msg| msg =~ /Ignoring invalid UTF-8 byte sequences/}
133+
Puppet.expects(:warning).with {|msg| msg =~ /Error with command ignoring invalid UTF-8 byte sequences/}
134134

135135
str = "an invalid utf-8 string \xff".force_encoding('utf-8')
136-
subject.utf8_string(str).should == "an invalid utf-8 string \ufffd"
136+
subject.utf8_string(str, "Error with command").should == "an invalid utf-8 string \ufffd"
137137
end
138138

139139
it "should leave the string alone if it's valid UTF-8" do
140140
Puppet.expects(:warning).never
141141

142142
str = "a valid utf-8 string".force_encoding('utf-8')
143-
subject.utf8_string(str).should == str
143+
subject.utf8_string(str, nil).should == str
144+
end
145+
146+
it "should leave the string alone if it's valid UTF-8 with non-ascii characters" do
147+
Puppet.expects(:warning).never
148+
149+
str = "a valid utf-8 string Ö"
150+
subject.utf8_string(str.dup.force_encoding('ASCII-8BIT'), nil).should == str
151+
end
152+
153+
describe "Debug log testing of bad data" do
154+
let!(:existing_log_level){ Puppet[:log_level]}
155+
156+
before :each do
157+
Puppet[:log_level] = "debug"
158+
end
159+
160+
after :each do
161+
Puppet[:log_level] = "notice"
162+
end
163+
164+
it "should emit a warning and debug messages when bad characters are found" do
165+
Puppet[:log_level] = "debug"
166+
Puppet.expects(:warning).with {|msg| msg =~ /Error encoding a 'replace facts' command for host 'foo.com' ignoring invalid/}
167+
Puppet.expects(:debug).with do |msg|
168+
msg =~ /Error encoding a 'replace facts' command for host 'foo.com'/ &&
169+
msg =~ /'some valid string' followed by 1 invalid\/undefined bytes then ''/
170+
end
171+
172+
# This will create a UTF-8 string literal, then switch to ASCII-8Bit when the bad
173+
# bytes are concated on below
174+
str = "some valid string" << [192].pack('c*')
175+
subject.utf8_string(str, "Error encoding a 'replace facts' command for host 'foo.com'").should == "some valid string\ufffd"
176+
end
177+
end
178+
179+
it "should emit a warning and no debug messages" do
180+
Puppet.expects(:warning).with {|msg| msg =~ /Error on replace catalog ignoring invalid UTF-8 byte sequences/}
181+
Puppet.expects(:debug).never
182+
str = "some valid string" << [192].pack('c*')
183+
subject.utf8_string(str, "Error on replace catalog").should == "some valid string\ufffd"
144184
end
145185
end
146186
end
147187

188+
describe "#all_indexes_of_char" do
189+
described_class.all_indexes_of_char("a\u2192b\u2192c\u2192d\u2192", "\u2192").should == [1, 3, 5, 7]
190+
described_class.all_indexes_of_char("abcd", "\u2192").should == []
191+
end
192+
193+
describe "#collapse_ranges" do
194+
described_class.collapse_ranges((1..5).to_a).should == [1..5]
195+
described_class.collapse_ranges([]).should == []
196+
described_class.collapse_ranges([1,2,3,5,7,8,9]).should == [1..3, 5..5, 7..9]
197+
end
198+
199+
describe "#error_char_context" do
200+
described_class.error_char_context("abc\ufffddef", [3]).should ==
201+
["'abc' followed by 1 invalid/undefined bytes then 'def'"]
202+
203+
described_class.error_char_context("abc\ufffd\ufffd\ufffd\ufffddef", [3,4,5,6]).should ==
204+
["'abc' followed by 4 invalid/undefined bytes then 'def'"]
205+
206+
described_class.error_char_context("abc\ufffddef\ufffdg", [3, 7]).should ==
207+
["'abc' followed by 1 invalid/undefined bytes then 'def'",
208+
"'def' followed by 1 invalid/undefined bytes then 'g'"]
209+
end
210+
211+
describe "#warn_if_invalid_chars" do
212+
213+
end
148214
end

puppet/spec/unit/util/puppetdb/command_spec.rb

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,42 @@
5454
subject.submit
5555
end
5656
end
57+
end
58+
59+
it "should not warn when the the string contains valid UTF-8 characters" do
60+
Puppet.expects(:warning).never
61+
cmd = described_class.new("command-1", 1, "foo.localdomain", {"foo" => "\u2192"})
62+
cmd.payload.include?("\u2192").should be_true
63+
end
5764

65+
it "should warn when a command payload includes non-ascii UTF-8 characters" do
66+
Puppet.expects(:warning).with {|msg| msg =~ /Error encoding a 'command-1' command for host 'foo.localdomain' ignoring invalid UTF-8 byte sequences/}
67+
cmd = described_class.new("command-1", 1, "foo.localdomain", {"foo" => [192].pack('c*')})
68+
cmd.payload.include?("\ufffd").should be_true
5869
end
5970

71+
describe "Debug log testing of bad data" do
72+
let!(:existing_log_level){ Puppet[:log_level]}
73+
74+
before :each do
75+
Puppet[:log_level] = "debug"
76+
end
77+
78+
after :each do
79+
Puppet[:log_level] = "notice"
80+
end
81+
82+
it "should warn when a command payload includes non-ascii UTF-8 characters" do
83+
Puppet.expects(:warning).with do |msg|
84+
msg =~ /Error encoding a 'command-1' command for host 'foo.localdomain' ignoring invalid UTF-8 byte sequences/
85+
end
86+
Puppet.expects(:debug).with do |msg|
87+
msg =~ /Error encoding a 'command-1' command for host 'foo.localdomain'/ &&
88+
msg =~ Regexp.new(Regexp.quote('"command":"command-1","version":1,"payload":{"foo"')) &&
89+
msg =~ /1 invalid\/undefined/
90+
end
91+
cmd = described_class.new("command-1", 1, "foo.localdomain", {"foo" => [192].pack('c*')})
92+
cmd.payload.include?("\ufffd").should be_true
93+
end
94+
end
6095
end

0 commit comments

Comments
 (0)