Skip to content

Commit d9ae1d7

Browse files
author
Ryan Senior
committed
(PDB-135) Improve and reduce the number of UTF-8 warnings
Prior to this commit, any non-ASCII character found in a catalog would be replaced by /ufffd and a warning would be issued saying it found an invalid character. The code now takes into account the force_encoding done by to_pson and attempts to force_encode it back to UTF-8. For most cases this should be sufficient. There are times when binary data can appear in a catalog. In this case, we have characters that we can't represent in UTF-8. For this we continue to give users a warning. This warning now includes the command being submitted and the node the command is for. If users have debug logging enabled, context is given around where this bad character data occurred. Up to 100 characters before/after this bad data is included in the Puppet debug log.
1 parent 4ec11fe commit d9ae1d7

File tree

4 files changed

+240
-35
lines changed

4 files changed

+240
-35
lines changed

puppet/lib/puppet/util/puppetdb/char_encoding.rb

Lines changed: 125 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -31,37 +31,141 @@ module CharEncoding
3131

3232
Utf8ReplacementChar = [ 0xEF, 0xBF, 0xBD ].pack("c*")
3333

34+
DEFAULT_INVALID_CHAR = "\ufffd"
3435

35-
def self.utf8_string(str)
36+
# @api private
37+
def self.all_indexes_of_char(str, char)
38+
(0..str.length).find_all{ |i| str[i] == char}
39+
end
40+
41+
# @api private
42+
#
43+
# Takes an array and returns a sub-array without the last element
44+
#
45+
# @return [Object]
46+
def self.drop_last(array)
47+
array[0..-2]
48+
end
49+
50+
# @api private
51+
#
52+
# Takes an array of increasing integers and collapses the sequential
53+
# integers into ranges
54+
#
55+
# @param index_array an array of sorted integers
56+
# @return [Range]
57+
def self.collapse_ranges(index_array)
58+
ranges = index_array.each.inject([]) do |spans, n|
59+
if spans.empty? || spans.last.end != n - 1
60+
spans << Range.new(n, n)
61+
else
62+
drop_last(spans) << Range.new(spans.last.begin,n)
63+
end
64+
end
65+
end
66+
67+
# @api private
68+
#
69+
# Scans the string s with bad characters found at bad_char_indexes
70+
# and returns an array of messages that give some context around the
71+
# bad characters. This will give up to 100 characters prior to the
72+
# bad character and 100 after. It will return fewer if it's at the
73+
# beginning of a string or if another bad character appears before
74+
# reaching the 100 characters
75+
#
76+
# @param str string coming from to_pson, likely a command to be submitted to PDB
77+
# @param bad_char_indexes an array of indexes into the string where invalid characters were found
78+
# @return [String]
79+
def self.error_char_context(str, bad_char_indexes)
80+
bad_char_ranges = collapse_ranges(bad_char_indexes)
81+
bad_char_ranges.each_with_index.inject([]) do |state, (r, index)|
82+
gap = r.to_a.length
83+
84+
prev_bad_char_end = bad_char_ranges[index-1].end + 1 if index > 0
85+
next_bad_char_begin = bad_char_ranges[index+1].begin - 1 if index < bad_char_ranges.length - 1
86+
87+
start_char = [prev_bad_char_end || 0, r.begin-100].max
88+
end_char = [next_bad_char_begin || str.length - 1, r.end+100].min
89+
x = [next_bad_char_begin || str.length, r.end+100, str.length]
90+
prefix = str[start_char..r.begin-1]
91+
suffix = str[r.end+1..end_char]
92+
93+
state << "'#{prefix}' followed by #{gap} invalid/undefined bytes then '#{suffix}'"
94+
end
95+
end
96+
97+
# @api private
98+
#
99+
# Warns the user if an invalid character was found. If debugging is
100+
# enabled will also log contextual information about where the bad
101+
# character(s) were found
102+
#
103+
# @param str A string coming from to_pson, likely a command to be submitted to PDB
104+
# @param error_context_str information about where this string came from for use in error messages
105+
# @return String
106+
def self.warn_if_invalid_chars(str, error_context_str)
107+
bad_char_indexes = all_indexes_of_char(str, DEFAULT_INVALID_CHAR)
108+
if bad_char_indexes.empty?
109+
str
110+
else
111+
Puppet.warning "#{error_context_str} ignoring invalid UTF-8 byte sequences in data to be sent to PuppetDB, see debug logging for more info"
112+
if Puppet.settings[:log_level] == "debug"
113+
Puppet.debug error_context_str + "\n" + error_char_context(str, bad_char_indexes).join("\n")
114+
end
115+
116+
str
117+
end
118+
end
119+
120+
# @api private
121+
#
122+
# Attempts to coerce str to UTF-8, if that fails will output context
123+
# information using error_context_str
124+
#
125+
# @param str A string coming from to_pson, likely a command to be submitted to PDB
126+
# @param error_context_str information about where this string came from for use in error messages
127+
# @return Str
128+
def self.coerce_to_utf8(str, error_context_str)
129+
str_copy = str.dup
130+
# This code is passed in a string that was created by
131+
# to_pson. to_pson calls force_encoding('ASCII-8BIT') on the
132+
# string before it returns it. This leaves the actual UTF-8 bytes
133+
# alone. Below we check to see if this is the case (this should be
134+
# most common). In this case, the bytes are still UTF-8 and we can
135+
# just encode! and we're good to go. If They are not valid UTF-8
136+
# bytes, that means there is probably some binary data mixed in
137+
# the middle of the UTF-8 string. In this case we need to output a
138+
# warning and give the user more information
139+
str_copy.force_encoding("UTF-8")
140+
if str_copy.valid_encoding?
141+
str_copy.encode!("UTF-8")
142+
else
143+
# This is force_encoded as US-ASCII to avoid any overlapping
144+
# byte related issues that could arise from mis-interpreting a
145+
# random extra byte as part of a multi-byte UTF-8 character
146+
str_copy.force_encoding("US-ASCII")
147+
warn_if_invalid_chars(str_copy.encode!("UTF-8",
148+
:invalid => :replace,
149+
:undef => :replace,
150+
:replace => DEFAULT_INVALID_CHAR),
151+
error_context_str)
152+
end
153+
end
154+
155+
def self.utf8_string(str, error_context_str)
36156
if RUBY_VERSION =~ /^1.8/
37157
# Ruby 1.8 doesn't have String#encode and related methods, and there
38158
# appears to be a bug in iconv that will interpret some byte sequences
39159
# as 6-byte characters. Thus, we are forced to resort to some unfortunate
40160
# manual chicanery.
41161
warn_if_changed(str, ruby18_clean_utf8(str))
42-
elsif str.encoding == Encoding::UTF_8
43-
# If we get here, we're in ruby 1.9+, so we have the string encoding methods
44-
# available. However, just because a ruby String object is already
45-
# marked as UTF-8, that doesn't guarantee that its contents are actually
46-
# valid; and if you call ruby's ".encode" method with an encoding of
47-
# "utf-8" for a String that ruby already believes is UTF-8, ruby
48-
# seems to optimize that to be a no-op. So, we have to do some more
49-
# complex handling...
50-
51-
# If the string already has valid encoding then we're fine.
52-
return str if str.valid_encoding?
53-
54-
# If not, we basically have to walk over the characters and replace
55-
# them by hand.
56-
warn_if_changed(str, str.each_char.map { |c| c.valid_encoding? ? c : "\ufffd"}.join)
57162
else
58-
# if we get here, we're ruby 1.9 and the current string is *not* encoded
59-
# as UTF-8. Thus we can actually rely on ruby's "encode" method.
60163
begin
61-
str.encode('UTF-8')
164+
coerce_to_utf8(str, error_context_str)
62165
rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError => e
63-
# If we got an exception, the string is either invalid or not
64-
# convertible to UTF-8, so drop those bytes.
166+
# If we got an exception, the string is either invalid or not
167+
# convertible to UTF-8, so drop those bytes.
168+
65169
warn_if_changed(str, str.encode('UTF-8', :invalid => :replace, :undef => :replace))
66170
end
67171
end

puppet/lib/puppet/util/puppetdb/command.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def initialize(command, version, certname, payload)
4242
#
4343
# This is roughly inline with how Puppet serializes for catalogs as of
4444
# Puppet 4.1.0. We need a better answer to non-utf8 data end-to-end.
45-
}.to_pson)
45+
}.to_pson, "Error encoding a '#{command}' command for host '#{certname}'")
4646
end
4747
end
4848

puppet/spec/unit/util/puppetdb/char_encoding_spec.rb

Lines changed: 79 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#!/usr/bin/env rspec
2-
# encoding: UTF-8
2+
# encoding: utf-8
33

44
require 'spec_helper'
55

@@ -79,28 +79,28 @@ def test_utf8_clean(in_bytes, expected_bytes)
7979
Puppet.expects(:warning).never
8080

8181
str = "any ascii string"
82-
subject.utf8_string(str).should == str
82+
subject.utf8_string(str, nil).should == str
8383
end
8484

8585
it "should strip invalid chars from non-overlapping latin-1 with a warning" do
8686
Puppet.expects(:warning).with {|msg| msg =~ /Ignoring invalid UTF-8 byte sequences/}
8787

8888
str = "a latin-1 string \xd6"
89-
subject.utf8_string(str).should == "a latin-1 string "
89+
subject.utf8_string(str, nil).should == "a latin-1 string "
9090
end
9191

9292
it "should strip invalid chars and warn if the string is invalid UTF-8" do
9393
Puppet.expects(:warning).with {|msg| msg =~ /Ignoring invalid UTF-8 byte sequences/}
9494

9595
str = "an invalid utf-8 string \xff"
96-
subject.utf8_string(str).should == "an invalid utf-8 string "
96+
subject.utf8_string(str, nil).should == "an invalid utf-8 string "
9797
end
9898

9999
it "should return a valid utf-8 string without warning" do
100100
Puppet.expects(:warning).never
101101

102102
str = "a valid utf-8 string \xc3\x96"
103-
subject.utf8_string(str).should == str
103+
subject.utf8_string(str, nil).should == str
104104
end
105105
end
106106

@@ -109,40 +109,106 @@ def test_utf8_clean(in_bytes, expected_bytes)
109109
Puppet.expects(:warning).never
110110

111111
str = "any ascii string".force_encoding('us-ascii')
112-
subject.utf8_string(str).should == str
112+
subject.utf8_string(str, nil).should == str
113113
end
114114

115115
it "should convert from latin-1 without a warning" do
116116
Puppet.expects(:warning).never
117117

118-
str = "a latin-1 string \xd6".force_encoding('iso-8859-1')
119-
subject.utf8_string(str).should == "a latin-1 string Ö"
118+
str = "a latin-1 string Ö".force_encoding('ASCII-8BIT')
119+
subject.utf8_string(str, nil).should == "a latin-1 string Ö"
120120
end
121121

122122
# UndefinedConversionError
123123
it "should replace undefined characters and warn when converting from binary" do
124-
Puppet.expects(:warning).with {|msg| msg =~ /Ignoring invalid UTF-8 byte sequences/}
124+
Puppet.expects(:warning).with {|msg| msg =~ /Error with command ignoring invalid UTF-8 byte sequences/}
125125

126126
str = "an invalid binary string \xff".force_encoding('binary')
127127
# \ufffd == unicode replacement character
128-
subject.utf8_string(str).should == "an invalid binary string \ufffd"
128+
subject.utf8_string(str, "Error with command").should == "an invalid binary string \ufffd"
129129
end
130130

131131
# InvalidByteSequenceError
132132
it "should replace undefined characters and warn if the string is invalid UTF-8" do
133-
Puppet.expects(:warning).with {|msg| msg =~ /Ignoring invalid UTF-8 byte sequences/}
133+
Puppet.expects(:warning).with {|msg| msg =~ /Error with command ignoring invalid UTF-8 byte sequences/}
134134

135135
str = "an invalid utf-8 string \xff".force_encoding('utf-8')
136-
subject.utf8_string(str).should == "an invalid utf-8 string \ufffd"
136+
subject.utf8_string(str, "Error with command").should == "an invalid utf-8 string \ufffd"
137137
end
138138

139139
it "should leave the string alone if it's valid UTF-8" do
140140
Puppet.expects(:warning).never
141141

142142
str = "a valid utf-8 string".force_encoding('utf-8')
143-
subject.utf8_string(str).should == str
143+
subject.utf8_string(str, nil).should == str
144+
end
145+
146+
it "should leave the string alone if it's valid UTF-8 with non-ascii characters" do
147+
Puppet.expects(:warning).never
148+
149+
str = "a valid utf-8 string Ö"
150+
subject.utf8_string(str.dup.force_encoding('ASCII-8BIT'), nil).should == str
151+
end
152+
153+
describe "Debug log testing of bad data" do
154+
let!(:existing_log_level){ Puppet[:log_level]}
155+
156+
before :each do
157+
Puppet[:log_level] = "debug"
158+
end
159+
160+
after :each do
161+
Puppet[:log_level] = "notice"
162+
end
163+
164+
it "should emit a warning and debug messages when bad characters are found" do
165+
Puppet[:log_level] = "debug"
166+
Puppet.expects(:warning).with {|msg| msg =~ /Error encoding a 'replace facts' command for host 'foo.com' ignoring invalid/}
167+
Puppet.expects(:debug).with do |msg|
168+
msg =~ /Error encoding a 'replace facts' command for host 'foo.com'/ &&
169+
msg =~ /'some valid string' followed by 1 invalid\/undefined bytes then ''/
170+
end
171+
172+
# This will create a UTF-8 string literal, then switch to ASCII-8Bit when the bad
173+
# bytes are concated on below
174+
str = "some valid string" << [192].pack('c*')
175+
subject.utf8_string(str, "Error encoding a 'replace facts' command for host 'foo.com'").should == "some valid string\ufffd"
176+
end
177+
end
178+
179+
it "should emit a warning and no debug messages" do
180+
Puppet.expects(:warning).with {|msg| msg =~ /Error on replace catalog ignoring invalid UTF-8 byte sequences/}
181+
Puppet.expects(:debug).never
182+
str = "some valid string" << [192].pack('c*')
183+
subject.utf8_string(str, "Error on replace catalog").should == "some valid string\ufffd"
144184
end
145185
end
146186
end
147187

188+
describe "#all_indexes_of_char" do
189+
described_class.all_indexes_of_char("a\u2192b\u2192c\u2192d\u2192", "\u2192").should == [1, 3, 5, 7]
190+
described_class.all_indexes_of_char("abcd", "\u2192").should == []
191+
end
192+
193+
describe "#collapse_ranges" do
194+
described_class.collapse_ranges((1..5).to_a).should == [1..5]
195+
described_class.collapse_ranges([]).should == []
196+
described_class.collapse_ranges([1,2,3,5,7,8,9]).should == [1..3, 5..5, 7..9]
197+
end
198+
199+
describe "#error_char_context" do
200+
described_class.error_char_context("abc\ufffddef", [3]).should ==
201+
["'abc' followed by 1 invalid/undefined bytes then 'def'"]
202+
203+
described_class.error_char_context("abc\ufffd\ufffd\ufffd\ufffddef", [3,4,5,6]).should ==
204+
["'abc' followed by 4 invalid/undefined bytes then 'def'"]
205+
206+
described_class.error_char_context("abc\ufffddef\ufffdg", [3, 7]).should ==
207+
["'abc' followed by 1 invalid/undefined bytes then 'def'",
208+
"'def' followed by 1 invalid/undefined bytes then 'g'"]
209+
end
210+
211+
describe "#warn_if_invalid_chars" do
212+
213+
end
148214
end

puppet/spec/unit/util/puppetdb/command_spec.rb

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,42 @@
5454
subject.submit
5555
end
5656
end
57+
end
58+
59+
it "should not warn when the the string contains valid UTF-8 characters" do
60+
Puppet.expects(:warning).never
61+
cmd = described_class.new("command-1", 1, "foo.localdomain", {"foo" => "\u2192"})
62+
cmd.payload.include?("\u2192").should be_true
63+
end
5764

65+
it "should warn when a command payload includes non-ascii UTF-8 characters" do
66+
Puppet.expects(:warning).with {|msg| msg =~ /Error encoding a 'command-1' command for host 'foo.localdomain' ignoring invalid UTF-8 byte sequences/}
67+
cmd = described_class.new("command-1", 1, "foo.localdomain", {"foo" => [192].pack('c*')})
68+
cmd.payload.include?("\ufffd").should be_true
5869
end
5970

71+
describe "Debug log testing of bad data" do
72+
let!(:existing_log_level){ Puppet[:log_level]}
73+
74+
before :each do
75+
Puppet[:log_level] = "debug"
76+
end
77+
78+
after :each do
79+
Puppet[:log_level] = "notice"
80+
end
81+
82+
it "should warn when a command payload includes non-ascii UTF-8 characters" do
83+
Puppet.expects(:warning).with do |msg|
84+
msg =~ /Error encoding a 'command-1' command for host 'foo.localdomain' ignoring invalid UTF-8 byte sequences/
85+
end
86+
Puppet.expects(:debug).with do |msg|
87+
msg =~ /Error encoding a 'command-1' command for host 'foo.localdomain'/ &&
88+
msg =~ Regexp.new(Regexp.quote('"command":"command-1","version":1,"payload":{"foo"')) &&
89+
msg =~ /1 invalid\/undefined/
90+
end
91+
cmd = described_class.new("command-1", 1, "foo.localdomain", {"foo" => [192].pack('c*')})
92+
cmd.payload.include?("\ufffd").should be_true
93+
end
94+
end
6095
end

0 commit comments

Comments
 (0)