Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
146 changes: 125 additions & 21 deletions puppet/lib/puppet/util/puppetdb/char_encoding.rb
Original file line number Diff line number Diff line change
Expand Up @@ -31,37 +31,141 @@ module CharEncoding

Utf8ReplacementChar = [ 0xEF, 0xBF, 0xBD ].pack("c*")

DEFAULT_INVALID_CHAR = "\ufffd"

def self.utf8_string(str)
# @api private
#
# Finds every character position in str at which char occurs.
#
# @param str [String] the string to scan
# @param char [String] the single-character string to look for
# @return [Array<Integer>] character (not byte) indexes in ascending order;
#   empty when char does not occur
def self.all_indexes_of_char(str, char)
  # Walk the characters once instead of calling str[i] for every index:
  # indexing a multibyte string by character position rescans from the
  # start each time, and an inclusive 0..str.length range would also probe
  # one position past the end of the string.
  str.each_char.with_index.select { |c, _i| c == char }.map(&:last)
end

# @api private
#
# Returns a copy of the given array with its final element removed.
# The argument itself is left untouched.
#
# @param array [Array] the array to shorten
# @return [Array] a new array holding every element except the last;
#   empty when the input has zero or one elements
def self.drop_last(array)
  array[0...-1]
end
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(0 ... str.length).find_all { |i| str[i] == char }

A little bit simpler


# @api private
#
# Takes an array of increasing integers and collapses each run of
# consecutive integers into a single Range. An isolated integer n
# becomes the one-element range n..n.
#
# @param index_array [Array<Integer>] sorted integers
# @return [Array<Range>] one inclusive range per run of consecutive
#   integers, in input order; empty when index_array is empty
def self.collapse_ranges(index_array)
  index_array.inject([]) do |spans, n|
    if spans.empty? || spans.last.end != n - 1
      # n does not continue the previous run, so it starts a new one.
      spans << (n..n)
    else
      # Ranges cannot be grown, so swap the last one for a widened copy.
      # Replacing it in place avoids re-copying the accumulator per element.
      spans[-1] = (spans.last.begin..n)
    end
    spans
  end
end
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Stolen from stackexchange

ranges = a.sort.uniq.inject([]) do |spans, n|
  if spans.empty? || spans.last.last != n - 1
    spans + Range.new(n,n)
  else
    spans[0..-2] + Range.new(spans.last.first,n)
  end
end

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here's the monkey version

class Array
  # splits array to sub-arrays wherever two adjacent elements satisfy a condition
  def split_by
    each_cons(2).inject([[first]]){|a, (i, j)|
      a.push([]) if yield(i, j)
      a.last.push j
      a
    }
  end

  # uses split_by to split array to subarrays with consecutive elements, then convert to range
  def to_range
    split_by{|i,j| j-i!=1}.map{|a| a.first..a.last}
  end
end


# @api private
#
# Scans the string str with bad characters found at bad_char_indexes
# and returns an array of messages that give some context around the
# bad characters. Each message shows up to 100 characters before a run
# of bad characters and up to 100 after it. Fewer are shown when the run
# is near the beginning or end of the string, or when another run of bad
# characters appears before 100 characters are reached.
#
# @param str [String] string coming from to_pson, likely a command to be submitted to PDB
# @param bad_char_indexes [Array<Integer>] sorted indexes into str where invalid characters were found
# @return [Array<String>] one message per run of consecutive bad characters
def self.error_char_context(str, bad_char_indexes)
  bad_char_ranges = collapse_ranges(bad_char_indexes)
  bad_char_ranges.each_with_index.map do |r, index|
    gap = r.end - r.begin + 1

    # The neighbouring runs bound the context window so that no message
    # quotes a replacement character belonging to another run.
    prev_bad_char_end = bad_char_ranges[index - 1].end + 1 if index > 0
    next_bad_char_begin = bad_char_ranges[index + 1].begin - 1 if index < bad_char_ranges.length - 1

    start_char = [prev_bad_char_end || 0, r.begin - 100].max
    end_char = [next_bad_char_begin || str.length - 1, r.end + 100].min

    # Exclusive upper bound: with an inclusive `..r.begin-1`, a run that
    # starts at index 0 would produce `0..-1` and quote the whole string.
    prefix = str[start_char...r.begin]
    suffix = str[(r.end + 1)..end_char]

    "'#{prefix}' followed by #{gap} invalid/undefined bytes then '#{suffix}'"
  end
end

# @api private
#
# Warns the user if an invalid character was found. If debugging is
# enabled, also logs contextual information about where the bad
# character(s) were found. The string itself is never modified.
#
# Detection works by looking for DEFAULT_INVALID_CHAR in str, so the
# warning fires for every occurrence of that character in the input.
#
# @param str [String] a string coming from to_pson, likely a command to be submitted to PDB
# @param error_context_str [String, nil] information about where this string
#   came from, for use in error messages
# @return [String] str, unchanged
def self.warn_if_invalid_chars(str, error_context_str)
  bad_char_indexes = all_indexes_of_char(str, DEFAULT_INVALID_CHAR)
  return str if bad_char_indexes.empty?

  Puppet.warning "#{error_context_str} ignoring invalid UTF-8 byte sequences in data to be sent to PuppetDB, see debug logging for more info"
  if Puppet.settings[:log_level] == "debug"
    context_lines = error_char_context(str, bad_char_indexes).join("\n")
    # Interpolate rather than concatenate: callers may pass nil as the
    # context, and nil + "\n" would raise instead of logging.
    Puppet.debug "#{error_context_str}\n#{context_lines}"
  end

  str
end

# @api private
#
# Attempts to coerce str to UTF-8 and, if that fails, reports the problem
# using error_context_str. The caller's string is not modified; all work
# happens on a duplicate.
#
# @param str [String] a string coming from to_pson, likely a command to be submitted to PDB
# @param error_context_str [String] information about where this string came from, for use in error messages
# @return [String] a UTF-8 encoded copy of str
def self.coerce_to_utf8(str, error_context_str)
  # Per the original author's note, to_pson hands back its result tagged
  # as ASCII-8BIT while leaving the underlying UTF-8 bytes alone. So the
  # common case is that re-tagging a copy as UTF-8 yields a valid string.
  candidate = str.dup
  candidate.force_encoding("UTF-8")
  return candidate.encode!("UTF-8") if candidate.valid_encoding?

  # Otherwise there is probably binary data mixed into the UTF-8 text.
  # Re-tag as US-ASCII before transcoding so that a stray byte cannot be
  # misread as part of a neighbouring multi-byte UTF-8 character, then
  # substitute the replacement character for the offending bytes and let
  # the user know.
  candidate.force_encoding("US-ASCII")
  sanitized = candidate.encode!("UTF-8",
                                :invalid => :replace,
                                :undef => :replace,
                                :replace => DEFAULT_INVALID_CHAR)
  warn_if_invalid_chars(sanitized, error_context_str)
end

def self.utf8_string(str, error_context_str)
if RUBY_VERSION =~ /^1.8/
# Ruby 1.8 doesn't have String#encode and related methods, and there
# appears to be a bug in iconv that will interpret some byte sequences
# as 6-byte characters. Thus, we are forced to resort to some unfortunate
# manual chicanery.
warn_if_changed(str, ruby18_clean_utf8(str))
elsif str.encoding == Encoding::UTF_8
# If we get here, we're in ruby 1.9+, so we have the string encoding methods
# available. However, just because a ruby String object is already
# marked as UTF-8, that doesn't guarantee that its contents are actually
# valid; and if you call ruby's ".encode" method with an encoding of
# "utf-8" for a String that ruby already believes is UTF-8, ruby
# seems to optimize that to be a no-op. So, we have to do some more
# complex handling...

# If the string already has valid encoding then we're fine.
return str if str.valid_encoding?

# If not, we basically have to walk over the characters and replace
# them by hand.
warn_if_changed(str, str.each_char.map { |c| c.valid_encoding? ? c : "\ufffd"}.join)
else
# if we get here, we're ruby 1.9 and the current string is *not* encoded
# as UTF-8. Thus we can actually rely on ruby's "encode" method.
begin
str.encode('UTF-8')
coerce_to_utf8(str, error_context_str)
rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError => e
# If we got an exception, the string is either invalid or not
# convertible to UTF-8, so drop those bytes.
# If we got an exception, the string is either invalid or not
# convertible to UTF-8, so drop those bytes.

warn_if_changed(str, str.encode('UTF-8', :invalid => :replace, :undef => :replace))
end
end
Expand Down
2 changes: 1 addition & 1 deletion puppet/lib/puppet/util/puppetdb/command.rb
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def initialize(command, version, certname, payload)
#
# This is roughly inline with how Puppet serializes for catalogs as of
# Puppet 4.1.0. We need a better answer to non-utf8 data end-to-end.
}.to_pson)
}.to_pson, "Error encoding a '#{command}' command for host '#{certname}'")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should I see this getting logged with this catalog?

exec {"\u006E\u0303":
  command => "/usr/bin/uptime"
}

exec {"\u00F1":
  command => "/usr/bin/date"
}

I see this:

Warning: Ignoring invalid UTF-8 byte sequences in data to be sent to PuppetDB

in the puppet log, but no mention of replace catalog specifically.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh sorry, not on your code

end
end

Expand Down
92 changes: 79 additions & 13 deletions puppet/spec/unit/util/puppetdb/char_encoding_spec.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env rspec
# encoding: UTF-8
# encoding: utf-8

require 'spec_helper'

Expand Down Expand Up @@ -79,28 +79,28 @@ def test_utf8_clean(in_bytes, expected_bytes)
Puppet.expects(:warning).never

str = "any ascii string"
subject.utf8_string(str).should == str
subject.utf8_string(str, nil).should == str
end

it "should strip invalid chars from non-overlapping latin-1 with a warning" do
Puppet.expects(:warning).with {|msg| msg =~ /Ignoring invalid UTF-8 byte sequences/}

str = "a latin-1 string \xd6"
subject.utf8_string(str).should == "a latin-1 string "
subject.utf8_string(str, nil).should == "a latin-1 string "
end

it "should strip invalid chars and warn if the string is invalid UTF-8" do
Puppet.expects(:warning).with {|msg| msg =~ /Ignoring invalid UTF-8 byte sequences/}

str = "an invalid utf-8 string \xff"
subject.utf8_string(str).should == "an invalid utf-8 string "
subject.utf8_string(str, nil).should == "an invalid utf-8 string "
end

it "should return a valid utf-8 string without warning" do
Puppet.expects(:warning).never

str = "a valid utf-8 string \xc3\x96"
subject.utf8_string(str).should == str
subject.utf8_string(str, nil).should == str
end
end

Expand All @@ -109,40 +109,106 @@ def test_utf8_clean(in_bytes, expected_bytes)
Puppet.expects(:warning).never

str = "any ascii string".force_encoding('us-ascii')
subject.utf8_string(str).should == str
subject.utf8_string(str, nil).should == str
end

it "should convert from latin-1 without a warning" do
Puppet.expects(:warning).never

str = "a latin-1 string \xd6".force_encoding('iso-8859-1')
subject.utf8_string(str).should == "a latin-1 string Ö"
str = "a latin-1 string Ö".force_encoding('ASCII-8BIT')
subject.utf8_string(str, nil).should == "a latin-1 string Ö"
end

# UndefinedConversionError
it "should replace undefined characters and warn when converting from binary" do
Puppet.expects(:warning).with {|msg| msg =~ /Ignoring invalid UTF-8 byte sequences/}
Puppet.expects(:warning).with {|msg| msg =~ /Error with command ignoring invalid UTF-8 byte sequences/}

str = "an invalid binary string \xff".force_encoding('binary')
# \ufffd == unicode replacement character
subject.utf8_string(str).should == "an invalid binary string \ufffd"
subject.utf8_string(str, "Error with command").should == "an invalid binary string \ufffd"
end

# InvalidByteSequenceError
it "should replace undefined characters and warn if the string is invalid UTF-8" do
Puppet.expects(:warning).with {|msg| msg =~ /Ignoring invalid UTF-8 byte sequences/}
Puppet.expects(:warning).with {|msg| msg =~ /Error with command ignoring invalid UTF-8 byte sequences/}

str = "an invalid utf-8 string \xff".force_encoding('utf-8')
subject.utf8_string(str).should == "an invalid utf-8 string \ufffd"
subject.utf8_string(str, "Error with command").should == "an invalid utf-8 string \ufffd"
end

it "should leave the string alone if it's valid UTF-8" do
Puppet.expects(:warning).never

str = "a valid utf-8 string".force_encoding('utf-8')
subject.utf8_string(str).should == str
subject.utf8_string(str, nil).should == str
end

# A copy of a valid UTF-8 string that has been re-tagged as ASCII-8BIT
# is expected to come back equal to the original, with no warning logged.
it "should leave the string alone if it's valid UTF-8 with non-ascii characters" do
Puppet.expects(:warning).never

str = "a valid utf-8 string Ö"
subject.utf8_string(str.dup.force_encoding('ASCII-8BIT'), nil).should == str
end

describe "Debug log testing of bad data" do
  # Captured eagerly (let!) so the value is read before the log level
  # is raised to debug below.
  let!(:existing_log_level) { Puppet[:log_level] }

  before :each do
    Puppet[:log_level] = "debug"
  end

  # Restore whatever level was in effect before the example, rather than
  # assuming it was "notice".
  after :each do
    Puppet[:log_level] = existing_log_level
  end

  it "should emit a warning and debug messages when bad characters are found" do
    Puppet.expects(:warning).with {|msg| msg =~ /Error encoding a 'replace facts' command for host 'foo.com' ignoring invalid/}
    Puppet.expects(:debug).with do |msg|
      msg =~ /Error encoding a 'replace facts' command for host 'foo.com'/ &&
        msg =~ /'some valid string' followed by 1 invalid\/undefined bytes then ''/
    end

    # This will create a UTF-8 string literal, then switch to ASCII-8BIT when the bad
    # bytes are concatenated on below
    str = "some valid string" << [192].pack('c*')
    subject.utf8_string(str, "Error encoding a 'replace facts' command for host 'foo.com'").should == "some valid string\ufffd"
  end
end

# Counterpart to the debug-level example: this example does not set
# Puppet[:log_level] itself, and expects the warning but no debug output.
it "should emit a warning and no debug messages" do
Puppet.expects(:warning).with {|msg| msg =~ /Error on replace catalog ignoring invalid UTF-8 byte sequences/}
Puppet.expects(:debug).never
# Appends the single byte produced by packing 192 to an otherwise plain string.
str = "some valid string" << [192].pack('c*')
subject.utf8_string(str, "Error on replace catalog").should == "some valid string\ufffd"
end
end
end

describe "#all_indexes_of_char" do
  # Expectations live inside examples so they run, and are reported, as
  # part of the suite instead of executing while the group is defined.
  it "should return the index of every occurrence of the character" do
    described_class.all_indexes_of_char("a\u2192b\u2192c\u2192d\u2192", "\u2192").should == [1, 3, 5, 7]
  end

  it "should return an empty array when the character does not occur" do
    described_class.all_indexes_of_char("abcd", "\u2192").should == []
  end
end

describe "#collapse_ranges" do
  # Expectations live inside examples so they run, and are reported, as
  # part of the suite instead of executing while the group is defined.
  it "should collapse a fully sequential array into one range" do
    described_class.collapse_ranges((1..5).to_a).should == [1..5]
  end

  it "should return an empty array for an empty array" do
    described_class.collapse_ranges([]).should == []
  end

  it "should produce one range per run of sequential integers" do
    described_class.collapse_ranges([1,2,3,5,7,8,9]).should == [1..3, 5..5, 7..9]
  end
end

describe "#error_char_context" do
  # Expectations live inside examples so they run, and are reported, as
  # part of the suite instead of executing while the group is defined.
  it "should describe a single bad character with its surrounding text" do
    described_class.error_char_context("abc\ufffddef", [3]).should ==
      ["'abc' followed by 1 invalid/undefined bytes then 'def'"]
  end

  it "should report a run of adjacent bad characters as one message" do
    described_class.error_char_context("abc\ufffd\ufffd\ufffd\ufffddef", [3,4,5,6]).should ==
      ["'abc' followed by 4 invalid/undefined bytes then 'def'"]
  end

  it "should stop the context at the neighbouring bad character" do
    described_class.error_char_context("abc\ufffddef\ufffdg", [3, 7]).should ==
      ["'abc' followed by 1 invalid/undefined bytes then 'def'",
       "'def' followed by 1 invalid/undefined bytes then 'g'"]
  end
end

# TODO: this group contains no examples yet.
describe "#warn_if_invalid_chars" do

end
end
35 changes: 35 additions & 0 deletions puppet/spec/unit/util/puppetdb/command_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,42 @@
subject.submit
end
end
end

# A payload value containing the valid non-ASCII character U+2192 must
# appear in the resulting command payload without any warning being logged.
it "should not warn when the the string contains valid UTF-8 characters" do
Puppet.expects(:warning).never
cmd = described_class.new("command-1", 1, "foo.localdomain", {"foo" => "\u2192"})
cmd.payload.include?("\u2192").should be_true
end

# NOTE(review): the description says "non-ascii UTF-8 characters", but the
# payload value here is the single byte produced by packing 192, and the
# example expects a warning plus U+FFFD in the payload, i.e. it exercises
# an invalid byte sequence rather than a valid non-ASCII character.
it "should warn when a command payload includes non-ascii UTF-8 characters" do
Puppet.expects(:warning).with {|msg| msg =~ /Error encoding a 'command-1' command for host 'foo.localdomain' ignoring invalid UTF-8 byte sequences/}
cmd = described_class.new("command-1", 1, "foo.localdomain", {"foo" => [192].pack('c*')})
cmd.payload.include?("\ufffd").should be_true
end

describe "Debug log testing of bad data" do
  # Captured eagerly (let!) so the value is read before the log level
  # is raised to debug below.
  let!(:existing_log_level) { Puppet[:log_level] }

  before :each do
    Puppet[:log_level] = "debug"
  end

  # Restore whatever level was in effect before the example, rather than
  # assuming it was "notice".
  after :each do
    Puppet[:log_level] = existing_log_level
  end

  it "should warn when a command payload includes non-ascii UTF-8 characters" do
    Puppet.expects(:warning).with do |msg|
      msg =~ /Error encoding a 'command-1' command for host 'foo.localdomain' ignoring invalid UTF-8 byte sequences/
    end
    # The debug message must carry the context string, a quoted slice of
    # the serialized command, and the count of replaced bytes.
    Puppet.expects(:debug).with do |msg|
      msg =~ /Error encoding a 'command-1' command for host 'foo.localdomain'/ &&
        msg =~ Regexp.new(Regexp.quote('"command":"command-1","version":1,"payload":{"foo"')) &&
        msg =~ /1 invalid\/undefined/
    end
    cmd = described_class.new("command-1", 1, "foo.localdomain", {"foo" => [192].pack('c*')})
    cmd.payload.include?("\ufffd").should be_true
  end
end
end