diff --git a/puppet/lib/puppet/util/puppetdb/char_encoding.rb b/puppet/lib/puppet/util/puppetdb/char_encoding.rb index 5e1ffc8165..d66d630157 100644 --- a/puppet/lib/puppet/util/puppetdb/char_encoding.rb +++ b/puppet/lib/puppet/util/puppetdb/char_encoding.rb @@ -31,37 +31,141 @@ module CharEncoding Utf8ReplacementChar = [ 0xEF, 0xBF, 0xBD ].pack("c*") + DEFAULT_INVALID_CHAR = "\ufffd" - def self.utf8_string(str) + # @api private + def self.all_indexes_of_char(str, char) + (0..str.length).find_all{ |i| str[i] == char} + end + + # @api private + # + # Takes an array and returns a sub-array without the last element + # + # @return [Object] + def self.drop_last(array) + array[0..-2] + end + + # @api private + # + # Takes an array of increasing integers and collapses the sequential + # integers into ranges + # + # @param index_array an array of sorted integers + # @return [Range] + def self.collapse_ranges(index_array) + ranges = index_array.each.inject([]) do |spans, n| + if spans.empty? || spans.last.end != n - 1 + spans << Range.new(n, n) + else + drop_last(spans) << Range.new(spans.last.begin,n) + end + end + end + + # @api private + # + # Scans the string s with bad characters found at bad_char_indexes + # and returns an array of messages that give some context around the + # bad characters. This will give up to 100 characters prior to the + # bad character and 100 after. It will return fewer if it's at the + # beginning of a string or if another bad character appears before + # reaching the 100 characters + # + # @param str string coming from to_pson, likely a command to be submitted to PDB + # @param bad_char_indexes an array of indexes into the string where invalid characters were found + # @return [String] + def self.error_char_context(str, bad_char_indexes) + bad_char_ranges = collapse_ranges(bad_char_indexes) + bad_char_ranges.each_with_index.inject([]) do |state, (r, index)| + gap = r.to_a.length + + prev_bad_char_end = bad_char_ranges[index-1].end + 1 if index > 0 + next_bad_char_begin = bad_char_ranges[index+1].begin - 1 if index < bad_char_ranges.length - 1 + + start_char = [prev_bad_char_end || 0, r.begin-100].max + end_char = [next_bad_char_begin || str.length - 1, r.end+100].min + x = [next_bad_char_begin || str.length, r.end+100, str.length] + prefix = str[start_char..r.begin-1] + suffix = str[r.end+1..end_char] + + state << "'#{prefix}' followed by #{gap} invalid/undefined bytes then '#{suffix}'" + end + end + + # @api private + # + # Warns the user if an invalid character was found. If debugging is + # enabled will also log contextual information about where the bad + # character(s) were found + # + # @param str A string coming from to_pson, likely a command to be submitted to PDB + # @param error_context_str information about where this string came from for use in error messages + # @return String + def self.warn_if_invalid_chars(str, error_context_str) + bad_char_indexes = all_indexes_of_char(str, DEFAULT_INVALID_CHAR) + if bad_char_indexes.empty? + str + else + Puppet.warning "#{error_context_str} ignoring invalid UTF-8 byte sequences in data to be sent to PuppetDB, see debug logging for more info" + if Puppet.settings[:log_level] == "debug" + Puppet.debug error_context_str + "\n" + error_char_context(str, bad_char_indexes).join("\n") + end + + str + end + end + + # @api private + # + # Attempts to coerce str to UTF-8, if that fails will output context + # information using error_context_str + # + # @param str A string coming from to_pson, likely a command to be submitted to PDB + # @param error_context_str information about where this string came from for use in error messages + # @return Str + def self.coerce_to_utf8(str, error_context_str) + str_copy = str.dup + # This code is passed in a string that was created by + # to_pson. to_pson calls force_encoding('ASCII-8BIT') on the + # string before it returns it. This leaves the actual UTF-8 bytes + # alone. Below we check to see if this is the case (this should be + # most common). In this case, the bytes are still UTF-8 and we can + # just encode! and we're good to go. If They are not valid UTF-8 + # bytes, that means there is probably some binary data mixed in + # the middle of the UTF-8 string. In this case we need to output a + # warning and give the user more information + str_copy.force_encoding("UTF-8") + if str_copy.valid_encoding? + str_copy.encode!("UTF-8") + else + # This is force_encoded as US-ASCII to avoid any overlapping + # byte related issues that could arise from mis-interpreting a + # random extra byte as part of a multi-byte UTF-8 character + str_copy.force_encoding("US-ASCII") + warn_if_invalid_chars(str_copy.encode!("UTF-8", + :invalid => :replace, + :undef => :replace, + :replace => DEFAULT_INVALID_CHAR), + error_context_str) + end + end + + def self.utf8_string(str, error_context_str) if RUBY_VERSION =~ /^1.8/ # Ruby 1.8 doesn't have String#encode and related methods, and there # appears to be a bug in iconv that will interpret some byte sequences # as 6-byte characters. Thus, we are forced to resort to some unfortunate # manual chicanery. warn_if_changed(str, ruby18_clean_utf8(str)) - elsif str.encoding == Encoding::UTF_8 - # If we get here, we're in ruby 1.9+, so we have the string encoding methods - # available. However, just because a ruby String object is already - # marked as UTF-8, that doesn't guarantee that its contents are actually - # valid; and if you call ruby's ".encode" method with an encoding of - # "utf-8" for a String that ruby already believes is UTF-8, ruby - # seems to optimize that to be a no-op. So, we have to do some more - # complex handling... - - # If the string already has valid encoding then we're fine. - return str if str.valid_encoding? - - # If not, we basically have to walk over the characters and replace - # them by hand. - warn_if_changed(str, str.each_char.map { |c| c.valid_encoding? ? c : "\ufffd"}.join) else - # if we get here, we're ruby 1.9 and the current string is *not* encoded - # as UTF-8. Thus we can actually rely on ruby's "encode" method. begin - str.encode('UTF-8') + coerce_to_utf8(str, error_context_str) rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError => e - # If we got an exception, the string is either invalid or not - # convertible to UTF-8, so drop those bytes. + # If we got an exception, the string is either invalid or not + # convertible to UTF-8, so drop those bytes. + warn_if_changed(str, str.encode('UTF-8', :invalid => :replace, :undef => :replace)) end end diff --git a/puppet/lib/puppet/util/puppetdb/command.rb b/puppet/lib/puppet/util/puppetdb/command.rb index e8278c8387..5ec8b3dd73 100644 --- a/puppet/lib/puppet/util/puppetdb/command.rb +++ b/puppet/lib/puppet/util/puppetdb/command.rb @@ -42,7 +42,7 @@ def initialize(command, version, certname, payload) # # This is roughly inline with how Puppet serializes for catalogs as of # Puppet 4.1.0. We need a better answer to non-utf8 data end-to-end. - }.to_pson) + }.to_pson, "Error encoding a '#{command}' command for host '#{certname}'") end end diff --git a/puppet/spec/unit/util/puppetdb/char_encoding_spec.rb b/puppet/spec/unit/util/puppetdb/char_encoding_spec.rb index 0d7d56587d..230f5751d8 100644 --- a/puppet/spec/unit/util/puppetdb/char_encoding_spec.rb +++ b/puppet/spec/unit/util/puppetdb/char_encoding_spec.rb @@ -1,5 +1,5 @@ #!/usr/bin/env rspec -# encoding: UTF-8 +# encoding: utf-8 require 'spec_helper' @@ -79,28 +79,28 @@ def test_utf8_clean(in_bytes, expected_bytes) Puppet.expects(:warning).never str = "any ascii string" - subject.utf8_string(str).should == str + subject.utf8_string(str, nil).should == str end it "should strip invalid chars from non-overlapping latin-1 with a warning" do Puppet.expects(:warning).with {|msg| msg =~ /Ignoring invalid UTF-8 byte sequences/} str = "a latin-1 string \xd6" - subject.utf8_string(str).should == "a latin-1 string " + subject.utf8_string(str, nil).should == "a latin-1 string " end it "should strip invalid chars and warn if the string is invalid UTF-8" do Puppet.expects(:warning).with {|msg| msg =~ /Ignoring invalid UTF-8 byte sequences/} str = "an invalid utf-8 string \xff" - subject.utf8_string(str).should == "an invalid utf-8 string " + subject.utf8_string(str, nil).should == "an invalid utf-8 string " end it "should return a valid utf-8 string without warning" do Puppet.expects(:warning).never str = "a valid utf-8 string \xc3\x96" - subject.utf8_string(str).should == str + subject.utf8_string(str, nil).should == str end end @@ -109,40 +109,106 @@ def test_utf8_clean(in_bytes, expected_bytes) Puppet.expects(:warning).never str = "any ascii string".force_encoding('us-ascii') - subject.utf8_string(str).should == str + subject.utf8_string(str, nil).should == str end it "should convert from latin-1 without a warning" do Puppet.expects(:warning).never - str = "a latin-1 string \xd6".force_encoding('iso-8859-1') - subject.utf8_string(str).should == "a latin-1 string Ö" + str = "a latin-1 string Ö".force_encoding('ASCII-8BIT') + subject.utf8_string(str, nil).should == "a latin-1 string Ö" end # UndefinedConversionError it "should replace undefined characters and warn when converting from binary" do - Puppet.expects(:warning).with {|msg| msg =~ /Ignoring invalid UTF-8 byte sequences/} + Puppet.expects(:warning).with {|msg| msg =~ /Error with command ignoring invalid UTF-8 byte sequences/} str = "an invalid binary string \xff".force_encoding('binary') # \ufffd == unicode replacement character - subject.utf8_string(str).should == "an invalid binary string \ufffd" + subject.utf8_string(str, "Error with command").should == "an invalid binary string \ufffd" end # InvalidByteSequenceError it "should replace undefined characters and warn if the string is invalid UTF-8" do - Puppet.expects(:warning).with {|msg| msg =~ /Ignoring invalid UTF-8 byte sequences/} + Puppet.expects(:warning).with {|msg| msg =~ /Error with command ignoring invalid UTF-8 byte sequences/} str = "an invalid utf-8 string \xff".force_encoding('utf-8') - subject.utf8_string(str).should == "an invalid utf-8 string \ufffd" + subject.utf8_string(str, "Error with command").should == "an invalid utf-8 string \ufffd" end it "should leave the string alone if it's valid UTF-8" do Puppet.expects(:warning).never str = "a valid utf-8 string".force_encoding('utf-8') - subject.utf8_string(str).should == str + subject.utf8_string(str, nil).should == str + end + + it "should leave the string alone if it's valid UTF-8 with non-ascii characters" do + Puppet.expects(:warning).never + + str = "a valid utf-8 string Ö" + subject.utf8_string(str.dup.force_encoding('ASCII-8BIT'), nil).should == str + end + + describe "Debug log testing of bad data" do + let!(:existing_log_level){ Puppet[:log_level]} + + before :each do + Puppet[:log_level] = "debug" + end + + after :each do + Puppet[:log_level] = "notice" + end + + it "should emit a warning and debug messages when bad characters are found" do + Puppet[:log_level] = "debug" + Puppet.expects(:warning).with {|msg| msg =~ /Error encoding a 'replace facts' command for host 'foo.com' ignoring invalid/} + Puppet.expects(:debug).with do |msg| + msg =~ /Error encoding a 'replace facts' command for host 'foo.com'/ && + msg =~ /'some valid string' followed by 1 invalid\/undefined bytes then ''/ + end + + # This will create a UTF-8 string literal, then switch to ASCII-8Bit when the bad + # bytes are concated on below + str = "some valid string" << [192].pack('c*') + subject.utf8_string(str, "Error encoding a 'replace facts' command for host 'foo.com'").should == "some valid string\ufffd" + end + end + + it "should emit a warning and no debug messages" do + Puppet.expects(:warning).with {|msg| msg =~ /Error on replace catalog ignoring invalid UTF-8 byte sequences/} + Puppet.expects(:debug).never + str = "some valid string" << [192].pack('c*') + subject.utf8_string(str, "Error on replace catalog").should == "some valid string\ufffd" end end end + describe "#all_indexes_of_char" do + described_class.all_indexes_of_char("a\u2192b\u2192c\u2192d\u2192", "\u2192").should == [1, 3, 5, 7] + described_class.all_indexes_of_char("abcd", "\u2192").should == [] + end + + describe "#collapse_ranges" do + described_class.collapse_ranges((1..5).to_a).should == [1..5] + described_class.collapse_ranges([]).should == [] + described_class.collapse_ranges([1,2,3,5,7,8,9]).should == [1..3, 5..5, 7..9] + end + + describe "#error_char_context" do + described_class.error_char_context("abc\ufffddef", [3]).should == + ["'abc' followed by 1 invalid/undefined bytes then 'def'"] + + described_class.error_char_context("abc\ufffd\ufffd\ufffd\ufffddef", [3,4,5,6]).should == + ["'abc' followed by 4 invalid/undefined bytes then 'def'"] + + described_class.error_char_context("abc\ufffddef\ufffdg", [3, 7]).should == + ["'abc' followed by 1 invalid/undefined bytes then 'def'", + "'def' followed by 1 invalid/undefined bytes then 'g'"] + end + + describe "#warn_if_invalid_chars" do + + end end diff --git a/puppet/spec/unit/util/puppetdb/command_spec.rb b/puppet/spec/unit/util/puppetdb/command_spec.rb index 8f7f15c0ae..dca236c86a 100644 --- a/puppet/spec/unit/util/puppetdb/command_spec.rb +++ b/puppet/spec/unit/util/puppetdb/command_spec.rb @@ -54,7 +54,42 @@ subject.submit end end + end + + it "should not warn when the the string contains valid UTF-8 characters" do + Puppet.expects(:warning).never + cmd = described_class.new("command-1", 1, "foo.localdomain", {"foo" => "\u2192"}) + cmd.payload.include?("\u2192").should be_true + end + it "should warn when a command payload includes non-ascii UTF-8 characters" do + Puppet.expects(:warning).with {|msg| msg =~ /Error encoding a 'command-1' command for host 'foo.localdomain' ignoring invalid UTF-8 byte sequences/} + cmd = described_class.new("command-1", 1, "foo.localdomain", {"foo" => [192].pack('c*')}) + cmd.payload.include?("\ufffd").should be_true end + describe "Debug log testing of bad data" do + let!(:existing_log_level){ Puppet[:log_level]} + + before :each do + Puppet[:log_level] = "debug" + end + + after :each do + Puppet[:log_level] = "notice" + end + + it "should warn when a command payload includes non-ascii UTF-8 characters" do + Puppet.expects(:warning).with do |msg| + msg =~ /Error encoding a 'command-1' command for host 'foo.localdomain' ignoring invalid UTF-8 byte sequences/ + end + Puppet.expects(:debug).with do |msg| + msg =~ /Error encoding a 'command-1' command for host 'foo.localdomain'/ && + msg =~ Regexp.new(Regexp.quote('"command":"command-1","version":1,"payload":{"foo"')) && + msg =~ /1 invalid\/undefined/ + end + cmd = described_class.new("command-1", 1, "foo.localdomain", {"foo" => [192].pack('c*')}) + cmd.payload.include?("\ufffd").should be_true + end + end end