| [1329] | 1 | diff --git a/activesupport/lib/active_support/multibyte.rb b/activesupport/lib/active_support/multibyte.rb |
|---|
| 2 | index 65a96af..b6354ee 100644 |
|---|
| 3 | --- a/activesupport/lib/active_support/multibyte.rb |
|---|
| 4 | +++ b/activesupport/lib/active_support/multibyte.rb |
|---|
| 5 | @@ -1,9 +1,5 @@ |
|---|
| 6 | # encoding: utf-8 |
|---|
| 7 | |
|---|
| 8 | -require 'active_support/multibyte/chars' |
|---|
| 9 | -require 'active_support/multibyte/exceptions' |
|---|
| 10 | -require 'active_support/multibyte/unicode_database' |
|---|
| 11 | - |
|---|
| 12 | module ActiveSupport #:nodoc: |
|---|
| 13 | module Multibyte |
|---|
| 14 | # A list of all available normalization forms. See http://www.unicode.org/reports/tr15/tr15-29.html for more |
|---|
| 15 | @@ -27,7 +23,35 @@ module ActiveSupport #:nodoc: |
|---|
| 16 | # |
|---|
| 17 | # Example: |
|---|
| 18 | # ActiveSupport::Multibyte.proxy_class = CharsForUTF32 |
|---|
| 19 | - mattr_accessor :proxy_class |
|---|
| 20 | - self.proxy_class = ActiveSupport::Multibyte::Chars |
|---|
| 21 | + def self.proxy_class=(klass) |
|---|
| 22 | + @proxy_class = klass |
|---|
| 23 | + end |
|---|
| 24 | + |
|---|
| 25 | + # Returns the current proxy class |
|---|
| 26 | + def self.proxy_class |
|---|
| 27 | + @proxy_class ||= ActiveSupport::Multibyte::Chars |
|---|
| 28 | + end |
|---|
| 29 | + |
|---|
| 30 | + # Regular expressions that describe valid byte sequences for a character |
|---|
| 31 | + VALID_CHARACTER = { |
|---|
| 32 | + # Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site) |
|---|
| 33 | + 'UTF-8' => /\A(?: |
|---|
| 34 | + [\x00-\x7f] | |
|---|
| 35 | + [\xc2-\xdf] [\x80-\xbf] | |
|---|
| 36 | + \xe0 [\xa0-\xbf] [\x80-\xbf] | |
|---|
| 37 | + [\xe1-\xef] [\x80-\xbf] [\x80-\xbf] | |
|---|
| 38 | + \xf0 [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] | |
|---|
| 39 | + [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] | |
|---|
| 40 | + \xf4 [\x80-\x8f] [\x80-\xbf] [\x80-\xbf])\z /xn, |
|---|
| 41 | + # Quick check for valid Shift-JIS characters, disregards the odd-even pairing |
|---|
| 42 | + 'Shift_JIS' => /\A(?: |
|---|
| 43 | + [\x00-\x7e \xa1-\xdf] | |
|---|
| 44 | + [\x81-\x9f \xe0-\xef] [\x40-\x7e \x80-\x9e \x9f-\xfc])\z /xn |
|---|
| 45 | + } |
|---|
| 46 | end |
|---|
| 47 | end |
|---|
| 48 | + |
|---|
| 49 | +require 'active_support/multibyte/chars' |
|---|
| 50 | +require 'active_support/multibyte/exceptions' |
|---|
| 51 | +require 'active_support/multibyte/unicode_database' |
|---|
| 52 | +require 'active_support/multibyte/utils' |
|---|
| 53 | diff --git a/activesupport/lib/active_support/multibyte/chars.rb b/activesupport/lib/active_support/multibyte/chars.rb |
|---|
| 54 | index 3d392d2..16bc130 100644 |
|---|
| 55 | --- a/activesupport/lib/active_support/multibyte/chars.rb |
|---|
| 56 | +++ b/activesupport/lib/active_support/multibyte/chars.rb |
|---|
| 57 | @@ -73,16 +73,7 @@ module ActiveSupport #:nodoc: |
|---|
| 58 | UNICODE_TRAILERS_PAT = /(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+\Z/ |
|---|
| 59 | UNICODE_LEADERS_PAT = /\A(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+/ |
|---|
| 60 | |
|---|
| 61 | - # Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site) |
|---|
| 62 | - UTF8_PAT = /\A(?: |
|---|
| 63 | - [\x00-\x7f] | |
|---|
| 64 | - [\xc2-\xdf] [\x80-\xbf] | |
|---|
| 65 | - \xe0 [\xa0-\xbf] [\x80-\xbf] | |
|---|
| 66 | - [\xe1-\xef] [\x80-\xbf] [\x80-\xbf] | |
|---|
| 67 | - \xf0 [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] | |
|---|
| 68 | - [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] | |
|---|
| 69 | - \xf4 [\x80-\x8f] [\x80-\xbf] [\x80-\xbf] |
|---|
| 70 | - )*\z/xn |
|---|
| 71 | + UTF8_PAT = ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'] |
|---|
| 72 | |
|---|
| 73 | attr_reader :wrapped_string |
|---|
| 74 | alias to_s wrapped_string |
|---|
| 75 | @@ -307,23 +298,23 @@ module ActiveSupport #:nodoc: |
|---|
| 76 | def rstrip |
|---|
| 77 | chars(@wrapped_string.gsub(UNICODE_TRAILERS_PAT, '')) |
|---|
| 78 | end |
|---|
| 79 | - |
|---|
| 80 | + |
|---|
| 81 | # Strips entire range of Unicode whitespace from the left of the string. |
|---|
| 82 | def lstrip |
|---|
| 83 | chars(@wrapped_string.gsub(UNICODE_LEADERS_PAT, '')) |
|---|
| 84 | end |
|---|
| 85 | - |
|---|
| 86 | + |
|---|
| 87 | # Strips entire range of Unicode whitespace from the right and left of the string. |
|---|
| 88 | def strip |
|---|
| 89 | rstrip.lstrip |
|---|
| 90 | end |
|---|
| 91 | - |
|---|
| 92 | + |
|---|
| 93 | # Returns the number of codepoints in the string |
|---|
| 94 | def size |
|---|
| 95 | self.class.u_unpack(@wrapped_string).size |
|---|
| 96 | end |
|---|
| 97 | alias_method :length, :size |
|---|
| 98 | - |
|---|
| 99 | + |
|---|
| 100 | # Reverses all characters in the string. |
|---|
| 101 | # |
|---|
| 102 | # Example: |
|---|
| 103 | @@ -331,7 +322,7 @@ module ActiveSupport #:nodoc: |
|---|
| 104 | def reverse |
|---|
| 105 | chars(self.class.u_unpack(@wrapped_string).reverse.pack('U*')) |
|---|
| 106 | end |
|---|
| 107 | - |
|---|
| 108 | + |
|---|
| 109 | # Implements Unicode-aware slice with codepoints. Slicing on one point returns the codepoints for that |
|---|
| 110 | # character. |
|---|
| 111 | # |
|---|
| 112 | @@ -646,7 +637,7 @@ module ActiveSupport #:nodoc: |
|---|
| 113 | string.split(//u).map do |c| |
|---|
| 114 | c.force_encoding(Encoding::ASCII) if c.respond_to?(:force_encoding) |
|---|
| 115 | |
|---|
| 116 | - if !UTF8_PAT.match(c) |
|---|
| 117 | + if !ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'].match(c) |
|---|
| 118 | n = c.unpack('C')[0] |
|---|
| 119 | n < 128 ? n.chr : |
|---|
| 120 | n < 160 ? [UCD.cp1252[n] || n].pack('U') : |
|---|
| 121 | diff --git a/activesupport/lib/active_support/multibyte/utils.rb b/activesupport/lib/active_support/multibyte/utils.rb |
|---|
| 122 | new file mode 100644 |
|---|
| 123 | index 0000000..acef84d |
|---|
| 124 | --- /dev/null |
|---|
| 125 | +++ b/activesupport/lib/active_support/multibyte/utils.rb |
|---|
| 126 | @@ -0,0 +1,61 @@ |
|---|
| 127 | +# encoding: utf-8 |
|---|
| 128 | + |
|---|
| 129 | +module ActiveSupport #:nodoc: |
|---|
| 130 | + module Multibyte #:nodoc: |
|---|
| 131 | + if Kernel.const_defined?(:Encoding) |
|---|
| 132 | + # Returns a regular expression that matches valid characters in the current encoding |
|---|
| 133 | + def self.valid_character |
|---|
| 134 | + VALID_CHARACTER[Encoding.default_internal.to_s] |
|---|
| 135 | + end |
|---|
| 136 | + else |
|---|
| 137 | + def self.valid_character |
|---|
| 138 | + case $KCODE |
|---|
| 139 | + when 'UTF8' |
|---|
| 140 | + VALID_CHARACTER['UTF-8'] |
|---|
| 141 | + when 'SJIS' |
|---|
| 142 | + VALID_CHARACTER['Shift_JIS'] |
|---|
| 143 | + end |
|---|
| 144 | + end |
|---|
| 145 | + end |
|---|
| 146 | + |
|---|
| 147 | + if 'string'.respond_to?(:valid_encoding?) |
|---|
| 148 | + # Verifies the encoding of a string |
|---|
| 149 | + def self.verify(string) |
|---|
| 150 | + string.valid_encoding? |
|---|
| 151 | + end |
|---|
| 152 | + else |
|---|
| 153 | + def self.verify(string) |
|---|
| 154 | + if expression = valid_character |
|---|
| 155 | + for c in string.split(//) |
|---|
| 156 | + return false unless valid_character.match(c) |
|---|
| 157 | + end |
|---|
| 158 | + end |
|---|
| 159 | + true |
|---|
| 160 | + end |
|---|
| 161 | + end |
|---|
| 162 | + |
|---|
| 163 | + # Verifies the encoding of the string and raises an exception when it's not valid |
|---|
| 164 | + def self.verify!(string) |
|---|
| 165 | + raise EncodingError.new("Found characters with invalid encoding") unless verify(string) |
|---|
| 166 | + end |
|---|
| 167 | + |
|---|
| 168 | + if 'string'.respond_to?(:force_encoding) |
|---|
| 169 | + # Removes all invalid characters from the string. |
|---|
| 170 | + # |
|---|
| 171 | + # Note: this method is a no-op in Ruby 1.9 |
|---|
| 172 | + def self.clean(string) |
|---|
| 173 | + string |
|---|
| 174 | + end |
|---|
| 175 | + else |
|---|
| 176 | + def self.clean(string) |
|---|
| 177 | + if expression = valid_character |
|---|
| 178 | + stripped = []; for c in string.split(//) |
|---|
| 179 | + stripped << c if valid_character.match(c) |
|---|
| 180 | + end; stripped.join |
|---|
| 181 | + else |
|---|
| 182 | + string |
|---|
| 183 | + end |
|---|
| 184 | + end |
|---|
| 185 | + end |
|---|
| 186 | + end |
|---|
| 187 | +end |
|---|
| 188 | \ No newline at end of file |
|---|
| 189 | diff --git a/activesupport/test/multibyte_utils_test.rb b/activesupport/test/multibyte_utils_test.rb |
|---|
| 190 | new file mode 100644 |
|---|
| 191 | index 0000000..d8ac5ff |
|---|
| 192 | --- /dev/null |
|---|
| 193 | +++ b/activesupport/test/multibyte_utils_test.rb |
|---|
| 194 | @@ -0,0 +1,141 @@ |
|---|
| 195 | +# encoding: utf-8 |
|---|
| 196 | + |
|---|
| 197 | +require 'abstract_unit' |
|---|
| 198 | +require 'multibyte_test_helpers' |
|---|
| 199 | + |
|---|
| 200 | +class MultibyteUtilsTest < ActiveSupport::TestCase |
|---|
| 201 | + include MultibyteTestHelpers |
|---|
| 202 | + |
|---|
| 203 | + test "valid_character returns an expression for the current encoding" do |
|---|
| 204 | + with_encoding('None') do |
|---|
| 205 | + assert_nil ActiveSupport::Multibyte.valid_character |
|---|
| 206 | + end |
|---|
| 207 | + with_encoding('UTF8') do |
|---|
| 208 | + assert_equal ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'], ActiveSupport::Multibyte.valid_character |
|---|
| 209 | + end |
|---|
| 210 | + with_encoding('SJIS') do |
|---|
| 211 | + assert_equal ActiveSupport::Multibyte::VALID_CHARACTER['Shift_JIS'], ActiveSupport::Multibyte.valid_character |
|---|
| 212 | + end |
|---|
| 213 | + end |
|---|
| 214 | + |
|---|
| 215 | + test "verify verifies ASCII strings are properly encoded" do |
|---|
| 216 | + with_encoding('None') do |
|---|
| 217 | + examples.each do |example| |
|---|
| 218 | + assert ActiveSupport::Multibyte.verify(example) |
|---|
| 219 | + end |
|---|
| 220 | + end |
|---|
| 221 | + end |
|---|
| 222 | + |
|---|
| 223 | + test "verify verifies UTF-8 strings are properly encoded" do |
|---|
| 224 | + with_encoding('UTF8') do |
|---|
| 225 | + assert ActiveSupport::Multibyte.verify(example('valid UTF-8')) |
|---|
| 226 | + assert !ActiveSupport::Multibyte.verify(example('invalid UTF-8')) |
|---|
| 227 | + end |
|---|
| 228 | + end |
|---|
| 229 | + |
|---|
| 230 | + test "verify verifies Shift-JIS strings are properly encoded" do |
|---|
| 231 | + with_encoding('SJIS') do |
|---|
| 232 | + assert ActiveSupport::Multibyte.verify(example('valid Shift-JIS')) |
|---|
| 233 | + assert !ActiveSupport::Multibyte.verify(example('invalid Shift-JIS')) |
|---|
| 234 | + end |
|---|
| 235 | + end |
|---|
| 236 | + |
|---|
| 237 | + test "verify! raises an exception when it finds an invalid character" do |
|---|
| 238 | + with_encoding('UTF8') do |
|---|
| 239 | + assert_raises(ActiveSupport::Multibyte::EncodingError) do |
|---|
| 240 | + ActiveSupport::Multibyte.verify!(example('invalid UTF-8')) |
|---|
| 241 | + end |
|---|
| 242 | + end |
|---|
| 243 | + end |
|---|
| 244 | + |
|---|
| 245 | + test "verify! doesn't raise an exception when the encoding is valid" do |
|---|
| 246 | + with_encoding('UTF8') do |
|---|
| 247 | + assert_nothing_raised do |
|---|
| 248 | + ActiveSupport::Multibyte.verify!(example('valid UTF-8')) |
|---|
| 249 | + end |
|---|
| 250 | + end |
|---|
| 251 | + end |
|---|
| 252 | + |
|---|
| 253 | + if RUBY_VERSION < '1.9' |
|---|
| 254 | + test "clean leaves ASCII strings intact" do |
|---|
| 255 | + with_encoding('None') do |
|---|
| 256 | + [ |
|---|
| 257 | + 'word', "\270\236\010\210\245" |
|---|
| 258 | + ].each do |string| |
|---|
| 259 | + assert_equal string, ActiveSupport::Multibyte.clean(string) |
|---|
| 260 | + end |
|---|
| 261 | + end |
|---|
| 262 | + end |
|---|
| 263 | + |
|---|
| 264 | + test "clean cleans invalid characters from UTF-8 encoded strings" do |
|---|
| 265 | + with_encoding('UTF8') do |
|---|
| 266 | + cleaned_utf8 = [8].pack('C*') |
|---|
| 267 | + assert_equal example('valid UTF-8'), ActiveSupport::Multibyte.clean(example('valid UTF-8')) |
|---|
| 268 | + assert_equal cleaned_utf8, ActiveSupport::Multibyte.clean(example('invalid UTF-8')) |
|---|
| 269 | + end |
|---|
| 270 | + end |
|---|
| 271 | + |
|---|
| 272 | + test "clean cleans invalid characters from Shift-JIS encoded strings" do |
|---|
| 273 | + with_encoding('SJIS') do |
|---|
| 274 | + cleaned_sjis = [184, 0, 136, 165].pack('C*') |
|---|
| 275 | + assert_equal example('valid Shift-JIS'), ActiveSupport::Multibyte.clean(example('valid Shift-JIS')) |
|---|
| 276 | + assert_equal cleaned_sjis, ActiveSupport::Multibyte.clean(example('invalid Shift-JIS')) |
|---|
| 277 | + end |
|---|
| 278 | + end |
|---|
| 279 | + else |
|---|
| 280 | + test "clean is a no-op" do |
|---|
| 281 | + with_encoding('UTF8') do |
|---|
| 282 | + assert_equal example('invalid Shift-JIS'), ActiveSupport::Multibyte.clean(example('invalid Shift-JIS')) |
|---|
| 283 | + end |
|---|
| 284 | + end |
|---|
| 285 | + end |
|---|
| 286 | + |
|---|
| 287 | + private |
|---|
| 288 | + |
|---|
| 289 | + STRINGS = { |
|---|
| 290 | + 'valid ASCII' => [65, 83, 67, 73, 73].pack('C*'), |
|---|
| 291 | + 'invalid ASCII' => [128].pack('C*'), |
|---|
| 292 | + 'valid UTF-8' => [227, 129, 147, 227, 129, 171, 227, 129, 161, 227, 130, 143].pack('C*'), |
|---|
| 293 | + 'invalid UTF-8' => [184, 158, 8, 136, 165].pack('C*'), |
|---|
| 294 | + 'valid Shift-JIS' => [131, 122, 129, 91, 131, 128].pack('C*'), |
|---|
| 295 | + 'invalid Shift-JIS' => [184, 158, 8, 0, 255, 136, 165].pack('C*') |
|---|
| 296 | + } |
|---|
| 297 | + |
|---|
| 298 | + if Kernel.const_defined?(:Encoding) |
|---|
| 299 | + def example(key) |
|---|
| 300 | + STRINGS[key].force_encoding(Encoding.default_internal) |
|---|
| 301 | + end |
|---|
| 302 | + |
|---|
| 303 | + def examples |
|---|
| 304 | + STRINGS.values.map { |s| s.force_encoding(Encoding.default_internal) } |
|---|
| 305 | + end |
|---|
| 306 | + else |
|---|
| 307 | + def example(key) |
|---|
| 308 | + STRINGS[key] |
|---|
| 309 | + end |
|---|
| 310 | + |
|---|
| 311 | + def examples |
|---|
| 312 | + STRINGS.values |
|---|
| 313 | + end |
|---|
| 314 | + end |
|---|
| 315 | + |
|---|
| 316 | + if 'string'.respond_to?(:encoding) |
|---|
| 317 | + def with_encoding(enc) |
|---|
| 318 | + before = Encoding.default_internal |
|---|
| 319 | + |
|---|
| 320 | + case enc |
|---|
| 321 | + when 'UTF8' |
|---|
| 322 | + Encoding.default_internal = Encoding::UTF_8 |
|---|
| 323 | + when 'SJIS' |
|---|
| 324 | + Encoding.default_internal = Encoding::Shift_JIS |
|---|
| 325 | + else |
|---|
| 326 | + Encoding.default_internal = Encoding::BINARY |
|---|
| 327 | + end |
|---|
| 328 | + yield |
|---|
| 329 | + |
|---|
| 330 | + Encoding.default_internal = before |
|---|
| 331 | + end |
|---|
| 332 | + else |
|---|
| 333 | + alias with_encoding with_kcode |
|---|
| 334 | + end |
|---|
| 335 | +end |
|---|
| 336 | \ No newline at end of file |
|---|
| 337 | |
|---|