(PDB-135) Improve and reduce the number of UTF-8 warnings

Ryan Senior · Ryan Senior · commit d9ae1d768acb · 2015-09-15T09:59:21.000-05:00
Prior to this commit, any non-ASCII character found in a catalog would
be replaced by /ufffd and a warning would be issued saying it found an
invalid character. The code now takes into account the force_encoding
done by to_pson and attempts to force_encode it back to UTF-8. For most
cases this should be sufficient.

There are times when binary data can appear in a catalog. In this case,
we have characters that we can't represent in UTF-8. For this we
continue to give users a warning. This warning now includes the command
being submitted and the node the command is for. If users have debug
logging enabled, context is given around where this bad character data
occurred. Up to 100 characters before/after this bad data is included in
the Puppet debug log.
diff --git a/puppet/lib/puppet/util/puppetdb/char_encoding.rb b/puppet/lib/puppet/util/puppetdb/char_encoding.rb
@@ -31,37 +31,141 @@ module CharEncoding
 
   Utf8ReplacementChar = [ 0xEF, 0xBF, 0xBD ].pack("c*")
 
+  DEFAULT_INVALID_CHAR = "\ufffd"
 
-  def self.utf8_string(str)
+  # @api private
+  def self.all_indexes_of_char(str, char)
+    (0..str.length).find_all{ |i| str[i] == char}
+  end
+
+  # @api private
+  #
+  # Takes an array and returns a sub-array without the last element
+  #
+  # @return [Object]
+  def self.drop_last(array)
+    array[0..-2]
+  end
+
+  # @api private
+  #
+  # Takes an array of increasing integers and collapses the sequential
+  # integers into ranges
+  #
+  # @param index_array an array of sorted integers
+  # @return [Range]
+  def self.collapse_ranges(index_array)
+    ranges = index_array.each.inject([]) do |spans, n|
+      if spans.empty? || spans.last.end != n - 1
+        spans << Range.new(n, n)
+      else
+        drop_last(spans) << Range.new(spans.last.begin,n)
+      end
+    end
+  end
+
+  # @api private
+  #
+  # Scans the string s with bad characters found at bad_char_indexes
+  # and returns an array of messages that give some context around the
+  # bad characters. This will give up to 100 characters prior to the
+  # bad character and 100 after. It will return fewer if it's at the
+  # beginning of a string or if another bad character appears before
+  # reaching the 100 characters
+  #
+  # @param str string coming from to_pson, likely a command to be submitted to PDB
+  # @param bad_char_indexes an array of indexes into the string where invalid characters were found
+  # @return [String]
+  def self.error_char_context(str, bad_char_indexes)
+    bad_char_ranges = collapse_ranges(bad_char_indexes)
+    bad_char_ranges.each_with_index.inject([]) do |state, (r, index)|
+      gap = r.to_a.length
+
+      prev_bad_char_end = bad_char_ranges[index-1].end + 1 if index > 0
+      next_bad_char_begin = bad_char_ranges[index+1].begin - 1 if index < bad_char_ranges.length - 1
+
+      start_char = [prev_bad_char_end || 0, r.begin-100].max
+      end_char = [next_bad_char_begin || str.length - 1, r.end+100].min
+      x = [next_bad_char_begin || str.length, r.end+100, str.length]
+      prefix = str[start_char..r.begin-1]
+      suffix = str[r.end+1..end_char]
+
+      state << "'#{prefix}' followed by #{gap} invalid/undefined bytes then '#{suffix}'"
+    end
+  end
+
+  # @api private
+  #
+  # Warns the user if an invalid character was found. If debugging is
+  # enabled will also log contextual information about where the bad
+  # character(s) were found
+  #
+  # @param str A string coming from to_pson, likely a command to be submitted to PDB
+  # @param error_context_str information about where this string came from for use in error messages
+  # @return String
+  def self.warn_if_invalid_chars(str, error_context_str)
+    bad_char_indexes = all_indexes_of_char(str, DEFAULT_INVALID_CHAR)
+    if bad_char_indexes.empty?
+      str
+    else
+      Puppet.warning "#{error_context_str} ignoring invalid UTF-8 byte sequences in data to be sent to PuppetDB, see debug logging for more info"
+      if Puppet.settings[:log_level] == "debug"
+        Puppet.debug error_context_str + "\n" + error_char_context(str, bad_char_indexes).join("\n")
+      end
+
+      str
+    end
+  end
+
+  # @api private
+  #
+  # Attempts to coerce str to UTF-8, if that fails will output context
+  # information using error_context_str
+  #
+  # @param str A string coming from to_pson, likely a command to be submitted to PDB
+  # @param error_context_str information about where this string came from for use in error messages
+  # @return Str
+  def self.coerce_to_utf8(str, error_context_str)
+    str_copy = str.dup
+    # This code is passed in a string that was created by
+    # to_pson. to_pson calls force_encoding('ASCII-8BIT') on the
+    # string before it returns it. This leaves the actual UTF-8 bytes
+    # alone. Below we check to see if this is the case (this should be
+    # most common). In this case, the bytes are still UTF-8 and we can
+    # just encode! and we're good to go. If They are not valid UTF-8
+    # bytes, that means there is probably some binary data mixed in
+    # the middle of the UTF-8 string. In this case we need to output a
+    # warning and give the user more information
+    str_copy.force_encoding("UTF-8")
+    if str_copy.valid_encoding?
+      str_copy.encode!("UTF-8")
+    else
+      # This is force_encoded as US-ASCII to avoid any overlapping
+      # byte related issues that could arise from mis-interpreting a
+      # random extra byte as part of a multi-byte UTF-8 character
+      str_copy.force_encoding("US-ASCII")
+      warn_if_invalid_chars(str_copy.encode!("UTF-8",
+                                             :invalid => :replace,
+                                             :undef => :replace,
+                                             :replace => DEFAULT_INVALID_CHAR),
+                            error_context_str)
+    end
+  end
+
+  def self.utf8_string(str, error_context_str)
     if RUBY_VERSION =~ /^1.8/
       # Ruby 1.8 doesn't have String#encode and related methods, and there
       #  appears to be a bug in iconv that will interpret some byte sequences
       #  as 6-byte characters.  Thus, we are forced to resort to some unfortunate
       #  manual chicanery.
       warn_if_changed(str, ruby18_clean_utf8(str))
-    elsif str.encoding == Encoding::UTF_8
-      # If we get here, we're in ruby 1.9+, so we have the string encoding methods
-      #  available.  However, just because  a ruby String object is already
-      #  marked as UTF-8, that doesn't guarantee that its contents are actually
-      #  valid; and if you call ruby's ".encode" method with an encoding of
-      #  "utf-8" for a String that ruby already believes is UTF-8, ruby
-      #  seems to optimize that to be a no-op.  So, we have to do some more
-      #  complex handling...
-
-      # If the string already has valid encoding then we're fine.
-      return str if str.valid_encoding?
-
-      # If not, we basically have to walk over the characters and replace
-      #  them by hand.
-      warn_if_changed(str, str.each_char.map { |c| c.valid_encoding? ? c : "\ufffd"}.join)
     else
-      # if we get here, we're ruby 1.9 and the current string is *not* encoded
-      #  as UTF-8.  Thus we can actually rely on ruby's "encode" method.
       begin
-        str.encode('UTF-8')
+        coerce_to_utf8(str, error_context_str)
       rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError => e
-        # If we got an exception, the string is either invalid or not
-        # convertible to UTF-8, so drop those bytes.
+          # If we got an exception, the string is either invalid or not
+          # convertible to UTF-8, so drop those bytes.
+
         warn_if_changed(str, str.encode('UTF-8', :invalid => :replace, :undef => :replace))
       end
     end
diff --git a/puppet/lib/puppet/util/puppetdb/command.rb b/puppet/lib/puppet/util/puppetdb/command.rb
@@ -42,7 +42,7 @@ def initialize(command, version, certname, payload)
       #
       # This is roughly inline with how Puppet serializes for catalogs as of
       # Puppet 4.1.0. We need a better answer to non-utf8 data end-to-end.
-      }.to_pson)
+      }.to_pson, "Error encoding a '#{command}' command for host '#{certname}'")
     end
   end
 
diff --git a/puppet/spec/unit/util/puppetdb/char_encoding_spec.rb b/puppet/spec/unit/util/puppetdb/char_encoding_spec.rb
@@ -1,5 +1,5 @@
 #!/usr/bin/env rspec
-# encoding: UTF-8
+# encoding: utf-8
 
 require 'spec_helper'
 
@@ -79,28 +79,28 @@ def test_utf8_clean(in_bytes, expected_bytes)
         Puppet.expects(:warning).never
 
         str = "any ascii string"
-        subject.utf8_string(str).should == str
+        subject.utf8_string(str, nil).should == str
       end
 
       it "should strip invalid chars from non-overlapping latin-1 with a warning" do
         Puppet.expects(:warning).with {|msg| msg =~ /Ignoring invalid UTF-8 byte sequences/}
 
         str = "a latin-1 string \xd6"
-        subject.utf8_string(str).should == "a latin-1 string "
+        subject.utf8_string(str, nil).should == "a latin-1 string "
       end
 
       it "should strip invalid chars and warn if the string is invalid UTF-8" do
         Puppet.expects(:warning).with {|msg| msg =~ /Ignoring invalid UTF-8 byte sequences/}
 
         str = "an invalid utf-8 string \xff"
-        subject.utf8_string(str).should == "an invalid utf-8 string "
+        subject.utf8_string(str, nil).should == "an invalid utf-8 string "
       end
 
       it "should return a valid utf-8 string without warning" do
         Puppet.expects(:warning).never
 
         str = "a valid utf-8 string \xc3\x96"
-        subject.utf8_string(str).should == str
+        subject.utf8_string(str, nil).should == str
       end
     end
 
@@ -109,40 +109,106 @@ def test_utf8_clean(in_bytes, expected_bytes)
         Puppet.expects(:warning).never
 
         str = "any ascii string".force_encoding('us-ascii')
-        subject.utf8_string(str).should == str
+        subject.utf8_string(str, nil).should == str
       end
 
       it "should convert from latin-1 without a warning" do
         Puppet.expects(:warning).never
 
-        str = "a latin-1 string \xd6".force_encoding('iso-8859-1')
-        subject.utf8_string(str).should == "a latin-1 string Ö"
+        str = "a latin-1 string Ö".force_encoding('ASCII-8BIT')
+        subject.utf8_string(str, nil).should == "a latin-1 string Ö"
       end
 
       # UndefinedConversionError
       it "should replace undefined characters and warn when converting from binary" do
-        Puppet.expects(:warning).with {|msg| msg =~ /Ignoring invalid UTF-8 byte sequences/}
+        Puppet.expects(:warning).with {|msg| msg =~ /Error with command ignoring invalid UTF-8 byte sequences/}
 
         str = "an invalid binary string \xff".force_encoding('binary')
         # \ufffd == unicode replacement character
-        subject.utf8_string(str).should == "an invalid binary string \ufffd"
+        subject.utf8_string(str, "Error with command").should == "an invalid binary string \ufffd"
       end
 
       # InvalidByteSequenceError
       it "should replace undefined characters and warn if the string is invalid UTF-8" do
-        Puppet.expects(:warning).with {|msg| msg =~ /Ignoring invalid UTF-8 byte sequences/}
+        Puppet.expects(:warning).with {|msg| msg =~ /Error with command ignoring invalid UTF-8 byte sequences/}
 
         str = "an invalid utf-8 string \xff".force_encoding('utf-8')
-        subject.utf8_string(str).should == "an invalid utf-8 string \ufffd"
+        subject.utf8_string(str, "Error with command").should == "an invalid utf-8 string \ufffd"
       end
 
       it "should leave the string alone if it's valid UTF-8" do
         Puppet.expects(:warning).never
 
         str = "a valid utf-8 string".force_encoding('utf-8')
-        subject.utf8_string(str).should == str
+        subject.utf8_string(str, nil).should == str
+      end
+
+      it "should leave the string alone if it's valid UTF-8 with non-ascii characters" do
+        Puppet.expects(:warning).never
+
+        str = "a valid utf-8 string Ö"
+        subject.utf8_string(str.dup.force_encoding('ASCII-8BIT'), nil).should == str
+      end
+
+      describe "Debug log testing of bad data" do
+        let!(:existing_log_level){ Puppet[:log_level]}
+
+        before :each do
+          Puppet[:log_level] = "debug"
+        end
+
+        after :each do
+          Puppet[:log_level] = "notice"
+        end
+
+        it "should emit a warning and debug messages when bad characters are found" do
+          Puppet[:log_level] = "debug"
+          Puppet.expects(:warning).with {|msg| msg =~ /Error encoding a 'replace facts' command for host 'foo.com' ignoring invalid/}
+          Puppet.expects(:debug).with do |msg|
+            msg =~ /Error encoding a 'replace facts' command for host 'foo.com'/ &&
+            msg =~ /'some valid string' followed by 1 invalid\/undefined bytes then ''/
+          end
+
+          # This will create a UTF-8 string literal, then switch to ASCII-8Bit when the bad
+          # bytes are concated on below
+          str = "some valid string" << [192].pack('c*')
+          subject.utf8_string(str, "Error encoding a 'replace facts' command for host 'foo.com'").should == "some valid string\ufffd"
+        end
+      end
+
+      it "should emit a warning and no debug messages" do
+        Puppet.expects(:warning).with {|msg| msg =~ /Error on replace catalog ignoring invalid UTF-8 byte sequences/}
+        Puppet.expects(:debug).never
+        str = "some valid string" << [192].pack('c*')
+        subject.utf8_string(str, "Error on replace catalog").should == "some valid string\ufffd"
       end
     end
   end
 
+  describe "#all_indexes_of_char" do
+    described_class.all_indexes_of_char("a\u2192b\u2192c\u2192d\u2192", "\u2192").should == [1, 3, 5, 7]
+    described_class.all_indexes_of_char("abcd", "\u2192").should == []
+  end
+
+  describe "#collapse_ranges" do
+    described_class.collapse_ranges((1..5).to_a).should == [1..5]
+    described_class.collapse_ranges([]).should == []
+    described_class.collapse_ranges([1,2,3,5,7,8,9]).should == [1..3, 5..5, 7..9]
+  end
+
+  describe "#error_char_context" do
+    described_class.error_char_context("abc\ufffddef", [3]).should ==
+      ["'abc' followed by 1 invalid/undefined bytes then 'def'"]
+
+    described_class.error_char_context("abc\ufffd\ufffd\ufffd\ufffddef", [3,4,5,6]).should ==
+      ["'abc' followed by 4 invalid/undefined bytes then 'def'"]
+
+    described_class.error_char_context("abc\ufffddef\ufffdg", [3, 7]).should ==
+      ["'abc' followed by 1 invalid/undefined bytes then 'def'",
+       "'def' followed by 1 invalid/undefined bytes then 'g'"]
+  end
+
+  describe "#warn_if_invalid_chars" do
+
+  end
 end
diff --git a/puppet/spec/unit/util/puppetdb/command_spec.rb b/puppet/spec/unit/util/puppetdb/command_spec.rb
@@ -54,7 +54,42 @@
         subject.submit
       end
     end
+  end
+
+  it "should not warn when the the string contains valid UTF-8 characters" do
+    Puppet.expects(:warning).never
+    cmd = described_class.new("command-1", 1, "foo.localdomain", {"foo" => "\u2192"})
+    cmd.payload.include?("\u2192").should be_true
+  end
 
+  it "should warn when a command payload includes non-ascii UTF-8 characters" do
+    Puppet.expects(:warning).with {|msg| msg =~ /Error encoding a 'command-1' command for host 'foo.localdomain' ignoring invalid UTF-8 byte sequences/}
+    cmd = described_class.new("command-1", 1, "foo.localdomain", {"foo" => [192].pack('c*')})
+    cmd.payload.include?("\ufffd").should be_true
   end
 
+  describe "Debug log testing of bad data" do
+    let!(:existing_log_level){ Puppet[:log_level]}
+
+    before :each do
+      Puppet[:log_level] = "debug"
+    end
+
+    after :each do
+      Puppet[:log_level] = "notice"
+    end
+
+    it "should warn when a command payload includes non-ascii UTF-8 characters" do
+      Puppet.expects(:warning).with do |msg|
+        msg =~ /Error encoding a 'command-1' command for host 'foo.localdomain' ignoring invalid UTF-8 byte sequences/
+      end
+      Puppet.expects(:debug).with do |msg|
+        msg =~ /Error encoding a 'command-1' command for host 'foo.localdomain'/ &&
+        msg =~ Regexp.new(Regexp.quote('"command":"command-1","version":1,"payload":{"foo"')) &&
+        msg =~ /1 invalid\/undefined/
+      end
+      cmd = described_class.new("command-1", 1, "foo.localdomain", {"foo" => [192].pack('c*')})
+      cmd.payload.include?("\ufffd").should be_true
+    end
+  end
 end

Original file line number	Diff line number	Diff line change
`@@ -42,7 +42,7 @@ def initialize(command, version, certname, payload)`
`42`	`42`	`#`
`43`	`43`	`# This is roughly inline with how Puppet serializes for catalogs as of`
`44`	`44`	`# Puppet 4.1.0. We need a better answer to non-utf8 data end-to-end.`
`45`		`- }.to_pson)`
	`45`	`+ }.to_pson, "Error encoding a '#{command}' command for host '#{certname}'")`
`46`	`46`	`end`
`47`	`47`	`end`
`48`	`48`