puppetlabs · wkalt · Sep 15, 2015 · Sep 1, 2015 · ajroetker · Sep 11, 2015
diff --git a/puppet/lib/puppet/util/puppetdb/char_encoding.rb b/puppet/lib/puppet/util/puppetdb/char_encoding.rb
@@ -31,37 +31,141 @@ module CharEncoding
 
   Utf8ReplacementChar = [ 0xEF, 0xBF, 0xBD ].pack("c*")
 
+  DEFAULT_INVALID_CHAR = "\ufffd"
 
-  def self.utf8_string(str)
+  # @api private
+  def self.all_indexes_of_char(str, char)
+    (0..str.length).find_all{ |i| str[i] == char}
+  end
+
+  # @api private
+  #
+  # Takes an array and returns a sub-array without the last element
+  #
+  # @return [Object]
+  def self.drop_last(array)
+    array[0..-2]
+  end
+
+  # @api private
+  #
+  # Takes an array of increasing integers and collapses the sequential
+  # integers into ranges
+  #
+  # @param index_array an array of sorted integers
+  # @return [Range]
+  def self.collapse_ranges(index_array)
+    ranges = index_array.each.inject([]) do |spans, n|
+      if spans.empty? || spans.last.end != n - 1
+        spans << Range.new(n, n)
+      else
+        drop_last(spans) << Range.new(spans.last.begin,n)
+      end
+    end
+  end
+
+  # @api private
+  #
+  # Scans the string s with bad characters found at bad_char_indexes
+  # and returns an array of messages that give some context around the
+  # bad characters. This will give up to 100 characters prior to the
+  # bad character and 100 after. It will return fewer if it's at the
+  # beginning of a string or if another bad character appears before
+  # reaching the 100 characters
+  #
+  # @param str string coming from to_pson, likely a command to be submitted to PDB
+  # @param bad_char_indexes an array of indexes into the string where invalid characters were found
+  # @return [String]
+  def self.error_char_context(str, bad_char_indexes)
+    bad_char_ranges = collapse_ranges(bad_char_indexes)
+    bad_char_ranges.each_with_index.inject([]) do |state, (r, index)|
+      gap = r.to_a.length
+
+      prev_bad_char_end = bad_char_ranges[index-1].end + 1 if index > 0
+      next_bad_char_begin = bad_char_ranges[index+1].begin - 1 if index < bad_char_ranges.length - 1
+
+      start_char = [prev_bad_char_end || 0, r.begin-100].max
+      end_char = [next_bad_char_begin || str.length - 1, r.end+100].min
+      x = [next_bad_char_begin || str.length, r.end+100, str.length]
+      prefix = str[start_char..r.begin-1]
+      suffix = str[r.end+1..end_char]
+
+      state << "'#{prefix}' followed by #{gap} invalid/undefined bytes then '#{suffix}'"
+    end
+  end
+
+  # @api private
+  #
+  # Warns the user if an invalid character was found. If debugging is
+  # enabled will also log contextual information about where the bad
+  # character(s) were found
+  #
+  # @param str A string coming from to_pson, likely a command to be submitted to PDB
+  # @param error_context_str information about where this string came from for use in error messages
+  # @return String
+  def self.warn_if_invalid_chars(str, error_context_str)
+    bad_char_indexes = all_indexes_of_char(str, DEFAULT_INVALID_CHAR)
+    if bad_char_indexes.empty?
+      str
+    else
+      Puppet.warning "#{error_context_str} ignoring invalid UTF-8 byte sequences in data to be sent to PuppetDB, see debug logging for more info"
+      if Puppet.settings[:log_level] == "debug"
+        Puppet.debug error_context_str + "\n" + error_char_context(str, bad_char_indexes).join("\n")
+      end
+
+      str
+    end
+  end
+
+  # @api private
+  #
+  # Attempts to coerce str to UTF-8, if that fails will output context
+  # information using error_context_str
+  #
+  # @param str A string coming from to_pson, likely a command to be submitted to PDB
+  # @param error_context_str information about where this string came from for use in error messages
+  # @return Str
+  def self.coerce_to_utf8(str, error_context_str)
+    str_copy = str.dup
+    # This code is passed in a string that was created by
+    # to_pson. to_pson calls force_encoding('ASCII-8BIT') on the
+    # string before it returns it. This leaves the actual UTF-8 bytes
+    # alone. Below we check to see if this is the case (this should be
+    # most common). In this case, the bytes are still UTF-8 and we can
+    # just encode! and we're good to go. If They are not valid UTF-8
+    # bytes, that means there is probably some binary data mixed in
+    # the middle of the UTF-8 string. In this case we need to output a
+    # warning and give the user more information
+    str_copy.force_encoding("UTF-8")
+    if str_copy.valid_encoding?
+      str_copy.encode!("UTF-8")
+    else
+      # This is force_encoded as US-ASCII to avoid any overlapping
+      # byte related issues that could arise from mis-interpreting a
+      # random extra byte as part of a multi-byte UTF-8 character
+      str_copy.force_encoding("US-ASCII")
+      warn_if_invalid_chars(str_copy.encode!("UTF-8",
+                                             :invalid => :replace,
+                                             :undef => :replace,
+                                             :replace => DEFAULT_INVALID_CHAR),
+                            error_context_str)
+    end
+  end
+
+  def self.utf8_string(str, error_context_str)
     if RUBY_VERSION =~ /^1.8/
       # Ruby 1.8 doesn't have String#encode and related methods, and there
       #  appears to be a bug in iconv that will interpret some byte sequences
       #  as 6-byte characters.  Thus, we are forced to resort to some unfortunate
       #  manual chicanery.
       warn_if_changed(str, ruby18_clean_utf8(str))
-    elsif str.encoding == Encoding::UTF_8
-      # If we get here, we're in ruby 1.9+, so we have the string encoding methods
-      #  available.  However, just because  a ruby String object is already
-      #  marked as UTF-8, that doesn't guarantee that its contents are actually
-      #  valid; and if you call ruby's ".encode" method with an encoding of
-      #  "utf-8" for a String that ruby already believes is UTF-8, ruby
-      #  seems to optimize that to be a no-op.  So, we have to do some more
-      #  complex handling...
-
-      # If the string already has valid encoding then we're fine.
-      return str if str.valid_encoding?
-
-      # If not, we basically have to walk over the characters and replace
-      #  them by hand.
-      warn_if_changed(str, str.each_char.map { |c| c.valid_encoding? ? c : "\ufffd"}.join)
     else
-      # if we get here, we're ruby 1.9 and the current string is *not* encoded
-      #  as UTF-8.  Thus we can actually rely on ruby's "encode" method.
       begin
-        str.encode('UTF-8')
+        coerce_to_utf8(str, error_context_str)
       rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError => e
-        # If we got an exception, the string is either invalid or not
-        # convertible to UTF-8, so drop those bytes.
+          # If we got an exception, the string is either invalid or not
+          # convertible to UTF-8, so drop those bytes.
+
         warn_if_changed(str, str.encode('UTF-8', :invalid => :replace, :undef => :replace))
       end
     end

diff --git a/puppet/lib/puppet/util/puppetdb/command.rb b/puppet/lib/puppet/util/puppetdb/command.rb
@@ -42,7 +42,7 @@ def initialize(command, version, certname, payload)
       #
       # This is roughly inline with how Puppet serializes for catalogs as of
       # Puppet 4.1.0. We need a better answer to non-utf8 data end-to-end.
-      }.to_pson)
+      }.to_pson, "Error encoding a '#{command}' command for host '#{certname}'")
     end
   end
 

diff --git a/puppet/spec/unit/util/puppetdb/char_encoding_spec.rb b/puppet/spec/unit/util/puppetdb/char_encoding_spec.rb
@@ -1,5 +1,5 @@
 #!/usr/bin/env rspec
-# encoding: UTF-8
+# encoding: utf-8
 
 require 'spec_helper'
 
@@ -79,28 +79,28 @@ def test_utf8_clean(in_bytes, expected_bytes)
         Puppet.expects(:warning).never
 
         str = "any ascii string"
-        subject.utf8_string(str).should == str
+        subject.utf8_string(str, nil).should == str
       end
 
       it "should strip invalid chars from non-overlapping latin-1 with a warning" do
         Puppet.expects(:warning).with {|msg| msg =~ /Ignoring invalid UTF-8 byte sequences/}
 
         str = "a latin-1 string \xd6"
-        subject.utf8_string(str).should == "a latin-1 string "
+        subject.utf8_string(str, nil).should == "a latin-1 string "
       end
 
       it "should strip invalid chars and warn if the string is invalid UTF-8" do
         Puppet.expects(:warning).with {|msg| msg =~ /Ignoring invalid UTF-8 byte sequences/}
 
         str = "an invalid utf-8 string \xff"
-        subject.utf8_string(str).should == "an invalid utf-8 string "
+        subject.utf8_string(str, nil).should == "an invalid utf-8 string "
       end
 
       it "should return a valid utf-8 string without warning" do
         Puppet.expects(:warning).never
 
         str = "a valid utf-8 string \xc3\x96"
-        subject.utf8_string(str).should == str
+        subject.utf8_string(str, nil).should == str
       end
     end
 
@@ -109,40 +109,106 @@ def test_utf8_clean(in_bytes, expected_bytes)
         Puppet.expects(:warning).never
 
         str = "any ascii string".force_encoding('us-ascii')
-        subject.utf8_string(str).should == str
+        subject.utf8_string(str, nil).should == str
       end
 
       it "should convert from latin-1 without a warning" do
         Puppet.expects(:warning).never
 
-        str = "a latin-1 string \xd6".force_encoding('iso-8859-1')
-        subject.utf8_string(str).should == "a latin-1 string Ö"
+        str = "a latin-1 string Ö".force_encoding('ASCII-8BIT')
+        subject.utf8_string(str, nil).should == "a latin-1 string Ö"
       end
 
       # UndefinedConversionError
       it "should replace undefined characters and warn when converting from binary" do
-        Puppet.expects(:warning).with {|msg| msg =~ /Ignoring invalid UTF-8 byte sequences/}
+        Puppet.expects(:warning).with {|msg| msg =~ /Error with command ignoring invalid UTF-8 byte sequences/}
 
         str = "an invalid binary string \xff".force_encoding('binary')
         # \ufffd == unicode replacement character
-        subject.utf8_string(str).should == "an invalid binary string \ufffd"
+        subject.utf8_string(str, "Error with command").should == "an invalid binary string \ufffd"
       end
 
       # InvalidByteSequenceError
       it "should replace undefined characters and warn if the string is invalid UTF-8" do
-        Puppet.expects(:warning).with {|msg| msg =~ /Ignoring invalid UTF-8 byte sequences/}
+        Puppet.expects(:warning).with {|msg| msg =~ /Error with command ignoring invalid UTF-8 byte sequences/}
 
         str = "an invalid utf-8 string \xff".force_encoding('utf-8')
-        subject.utf8_string(str).should == "an invalid utf-8 string \ufffd"
+        subject.utf8_string(str, "Error with command").should == "an invalid utf-8 string \ufffd"
       end
 
       it "should leave the string alone if it's valid UTF-8" do
         Puppet.expects(:warning).never
 
         str = "a valid utf-8 string".force_encoding('utf-8')
-        subject.utf8_string(str).should == str
+        subject.utf8_string(str, nil).should == str
+      end
+
+      it "should leave the string alone if it's valid UTF-8 with non-ascii characters" do
+        Puppet.expects(:warning).never
+
+        str = "a valid utf-8 string Ö"
+        subject.utf8_string(str.dup.force_encoding('ASCII-8BIT'), nil).should == str
+      end
+
+      describe "Debug log testing of bad data" do
+        let!(:existing_log_level){ Puppet[:log_level]}
+
+        before :each do
+          Puppet[:log_level] = "debug"
+        end
+
+        after :each do
+          Puppet[:log_level] = "notice"
+        end
+
+        it "should emit a warning and debug messages when bad characters are found" do
+          Puppet[:log_level] = "debug"
+          Puppet.expects(:warning).with {|msg| msg =~ /Error encoding a 'replace facts' command for host 'foo.com' ignoring invalid/}
+          Puppet.expects(:debug).with do |msg|
+            msg =~ /Error encoding a 'replace facts' command for host 'foo.com'/ &&
+            msg =~ /'some valid string' followed by 1 invalid\/undefined bytes then ''/
+          end
+
+          # This will create a UTF-8 string literal, then switch to ASCII-8Bit when the bad
+          # bytes are concated on below
+          str = "some valid string" << [192].pack('c*')
+          subject.utf8_string(str, "Error encoding a 'replace facts' command for host 'foo.com'").should == "some valid string\ufffd"
+        end
+      end
+
+      it "should emit a warning and no debug messages" do
+        Puppet.expects(:warning).with {|msg| msg =~ /Error on replace catalog ignoring invalid UTF-8 byte sequences/}
+        Puppet.expects(:debug).never
+        str = "some valid string" << [192].pack('c*')
+        subject.utf8_string(str, "Error on replace catalog").should == "some valid string\ufffd"
       end
     end
   end
 
+  describe "#all_indexes_of_char" do
+    described_class.all_indexes_of_char("a\u2192b\u2192c\u2192d\u2192", "\u2192").should == [1, 3, 5, 7]
+    described_class.all_indexes_of_char("abcd", "\u2192").should == []
+  end
+
+  describe "#collapse_ranges" do
+    described_class.collapse_ranges((1..5).to_a).should == [1..5]
+    described_class.collapse_ranges([]).should == []
+    described_class.collapse_ranges([1,2,3,5,7,8,9]).should == [1..3, 5..5, 7..9]
+  end
+
+  describe "#error_char_context" do
+    described_class.error_char_context("abc\ufffddef", [3]).should ==
+      ["'abc' followed by 1 invalid/undefined bytes then 'def'"]
+
+    described_class.error_char_context("abc\ufffd\ufffd\ufffd\ufffddef", [3,4,5,6]).should ==
+      ["'abc' followed by 4 invalid/undefined bytes then 'def'"]
+
+    described_class.error_char_context("abc\ufffddef\ufffdg", [3, 7]).should ==
+      ["'abc' followed by 1 invalid/undefined bytes then 'def'",
+       "'def' followed by 1 invalid/undefined bytes then 'g'"]
+  end
+
+  describe "#warn_if_invalid_chars" do
+
+  end
 end
diff --git a/puppet/spec/unit/util/puppetdb/command_spec.rb b/puppet/spec/unit/util/puppetdb/command_spec.rb
@@ -54,7 +54,42 @@
         subject.submit
       end
     end
+  end
+
+  it "should not warn when the the string contains valid UTF-8 characters" do
+    Puppet.expects(:warning).never
+    cmd = described_class.new("command-1", 1, "foo.localdomain", {"foo" => "\u2192"})
+    cmd.payload.include?("\u2192").should be_true
+  end
 
+  it "should warn when a command payload includes non-ascii UTF-8 characters" do
+    Puppet.expects(:warning).with {|msg| msg =~ /Error encoding a 'command-1' command for host 'foo.localdomain' ignoring invalid UTF-8 byte sequences/}
+    cmd = described_class.new("command-1", 1, "foo.localdomain", {"foo" => [192].pack('c*')})
+    cmd.payload.include?("\ufffd").should be_true
   end
 
+  describe "Debug log testing of bad data" do
+    let!(:existing_log_level){ Puppet[:log_level]}
+
+    before :each do
+      Puppet[:log_level] = "debug"
+    end
+
+    after :each do
+      Puppet[:log_level] = "notice"
+    end
+
+    it "should warn when a command payload includes non-ascii UTF-8 characters" do
+      Puppet.expects(:warning).with do |msg|
+        msg =~ /Error encoding a 'command-1' command for host 'foo.localdomain' ignoring invalid UTF-8 byte sequences/
+      end
+      Puppet.expects(:debug).with do |msg|
+        msg =~ /Error encoding a 'command-1' command for host 'foo.localdomain'/ &&
+        msg =~ Regexp.new(Regexp.quote('"command":"command-1","version":1,"payload":{"foo"')) &&
+        msg =~ /1 invalid\/undefined/
+      end
+      cmd = described_class.new("command-1", 1, "foo.localdomain", {"foo" => [192].pack('c*')})
+      cmd.payload.include?("\ufffd").should be_true
+    end
+  end
 end