Changeset 3526
- Timestamp:
- 02/13/06 16:52:01 (2 years ago)
- Files:
-
- incoming/trunk/lib/rex/text.rb (modified) (2 diffs)
- incoming/trunk/lib/rex/text.rb.ut.rb (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
incoming/trunk/lib/rex/text.rb
r3524 r3526 116 116 117 117 # 118 # Converts standard ASCII text to 16-bit unicode 119 # 120 # By default, little-endian unicode. By providing non-nil value for 121 # endian, convert to 16-bit big-endian unicode. NOTE, most systems require 122 # a marker to specify that the unicode text being provided is in 123 # big-endian. Use 0xFEFF, which is not a "legal" unicode code point. 124 # 125 def self.to_unicode(str='', mode = 'utf-16le') 126 case mode 127 when 'utf-16le' 128 return str.unpack('C*').pack('v*') 129 when 'utf-16be' 130 return str.unpack('C*').pack('n*') 131 when 'utf-32le' 132 return str.unpack('C*').pack('V*') 133 when 'utf-32be' 134 return str.unpack('C*').pack('N*') 135 when 'utf-7' 118 # Converts standard ASCII text to a unicode string. 119 # 120 # Supported unicode types include: utf-16le, utf-16be, utf-32le, utf-32be, utf-7, and utf-8 121 # 122 # Providing 'mode' provides hints to the actual encoder as to how it should encode the string. Only UTF-7 and UTF-8 use "mode". 123 # 124 # utf-7 by default does not encode alphanumeric and a few other characters. By specifying the mode of "all", then all of the characters are encoded, not just the non-alphanumeric set. 125 # to_unicode(str, 'utf-7', 'all') 126 # 127 # utf-8 specifies that alphanumeric characters are used directly, eg "a" is just "a". However, there exist 6 different overlong encodings of "a" that are technically not valid, but parse just fine in most utf-8 parsers. (0xC1A1, 0xE081A1, 0xF08081A1, 0xF8808081A1, 0xFC80808081A1, 0xFE8080808081A1). How many bytes to use for the overlong encoding is specified by providing 'size'. 128 # to_unicode(str, 'utf-8', 'overlong', 2) 129 # 130 # Many utf-8 parsers also allow invalid overlong encodings, where bits that are unused when encoding a single byte are modified. Many parsers will ignore these bits, rendering simple string matching to be ineffective for dealing with UTF-8 strings. There are many more invalid overlong encodings possible for "a". 
For example, three encodings are available for an invalid 2 byte encoding of "a". (0xC1E1 0xC161 0xC121). By specifying "invalid", a random invalid encoding is chosen for the given byte size. 131 # to_unicode(str, 'utf-8', 'invalid', 2) 132 # 133 # utf-7 defaults to 'normal' utf-7 encoding 134 # utf-8 defaults to 2 byte 'normal' encoding 135 # 136 def self.to_unicode(str='', type = 'utf-16le', mode = '', size = 2) 137 case type 138 when 'utf-16le' 139 return str.unpack('C*').pack('v*') 140 when 'utf-16be' 141 return str.unpack('C*').pack('n*') 142 when 'utf-32le' 143 return str.unpack('C*').pack('V*') 144 when 'utf-32be' 145 return str.unpack('C*').pack('N*') 146 when 'utf-7' 147 case mode 148 when 'all' 149 return str.gsub(/./){ |a| 150 out = '' 151 if 'a' != '+' 152 out = encode_base64(to_unicode(a, 'utf-16be')).gsub(/[=\r\n]/, '') 153 end 154 '+' + out + '-' 155 } 156 else 136 157 return str.gsub(/[^\n\r\t\ A-Za-z0-9\'\(\),-.\/\:\?]/){ |a| 137 158 out = '' … … 141 162 '+' + out + '-' 142 163 } 143 when 'utf-7-all' 144 return str.gsub(/./){ |a| 145 out = '' 146 if 'a' != '+' 147 out = encode_base64(to_unicode(a, 'utf-16be')).gsub(/[=\r\n]/, '') 164 end 165 when 'utf-8' 166 if size >= 2 and size <= 7 167 string = '' 168 str.each_byte { |a| 169 if a > 0x7f || mode != '' 170 # ugh. turn a single byte into the binary representation of it, in array form 171 bin = [a].pack('C').unpack('B8')[0].split(//) 172 173 # even more ugh. 174 bin.collect!{|a| a = a.to_i} 175 176 out = Array.new(8 * size, 0) 177 178 0.upto(size - 1) { |i| 179 out[i] = 1 180 out[i * 8] = 1 181 } 182 183 i = 0 184 byte = 0 185 bin.reverse.each { |bit| 186 if i < 6 187 mod = (((size * 8) - 1) - byte * 8) - i 188 out[mod] = bit 189 else 190 byte = byte + 1 191 i = 0 192 redo 193 end 194 i = i + 1 195 } 196 197 if mode != '' 198 case mode 199 when 'overlong' 200 # do nothing, since we already handle this as above... 
201 when 'invalid' 202 done = 0 203 while done == 0 204 bits = [7, 8, 15, 16, 23, 24, 31, 32, 41] 205 bits.each { |bit| 206 bit = (size * 8) - bit 207 if bit > 1 208 set = rand(2) 209 if out[bit] != set 210 out[bit] = set 211 done = 1 212 end 213 end 214 } 215 end 216 else 217 raise TypeError, 'Invalid mode. Only "overlong" and "invalid" are acceptable modes for utf-8' 218 end 219 end 220 string += [out.join('')].pack('B*') 221 else 222 string += [a].pack('C') 148 223 end 149 '+' + out + '-'150 224 } 225 return string 151 226 else 152 raise TypeError, 'invalid utf type' 153 end 227 raise TypeError, 'invalid utf-8 size' 228 end 229 else 230 raise TypeError, 'invalid utf type' 231 end 154 232 end 155 233 incoming/trunk/lib/rex/text.rb.ut.rb
r3524 r3526 24 24 assert_equal("\x00\x00\x00a\x00\x00\x00b\x00\x00\x00c", Rex::Text.to_unicode('abc', 'utf-32be'), 'utf-32be') 25 25 assert_equal("abc+-abc-+AAA-", Rex::Text.to_unicode("abc+abc-\x00", 'utf-7'), 'utf-7') 26 assert_equal("+AGE-+AGI-+AGM-+ACs-+AGE-+AGI-+AGM-+AC0-+AAA-", Rex::Text.to_unicode("abc+abc-\x00", 'utf-7-all'), 'utf-7-all') 26 assert_equal("+AGE-+AGI-+AGM-+ACs-+AGE-+AGI-+AGM-+AC0-+AAA-", Rex::Text.to_unicode("abc+abc-\x00", 'utf-7', 'all'), 'utf-7-all') 27 28 assert_equal("a\303\272", Rex::Text.to_unicode("a\xFA", 'utf-8')) 29 assert_equal("\xC1\xA1", Rex::Text.to_unicode('a', 'utf-8', 'overlong', 2), 'utf-8 overlong') 30 assert_equal("\xE0\x81\xA1", Rex::Text.to_unicode('a', 'utf-8', 'overlong', 3), 'utf-8 overlong') 31 assert_equal("\xF0\x80\x81\xA1", Rex::Text.to_unicode('a', 'utf-8', 'overlong', 4), 'utf-8 overlong') 32 assert_equal("\xF8\x80\x80\x81\xA1", Rex::Text.to_unicode('a', 'utf-8', 'overlong', 5), 'utf-8 overlong') 33 assert_equal("\xFC\x80\x80\x80\x81\xA1", Rex::Text.to_unicode('a', 'utf-8', 'overlong', 6), 'utf-8 overlong') 34 assert_equal("\xFE\x80\x80\x80\x80\x81\xA1", Rex::Text.to_unicode('a', 'utf-8', 'overlong', 7), 'utf-8 overlong') 35 100.times { 36 assert(["\xC1\x21","\xC1\x61","\xC1\xE1"].include?(Rex::Text.to_unicode('a', 'utf-8', 'invalid')), 'utf-8 invalid') 37 assert(["\xE0\x01\x21","\xE0\x01\x61","\xE0\x01\xA1","\xE0\x01\xE1","\xE0\x41\x21","\xE0\x41\x61","\xE0\x41\xA1","\xE0\x41\xE1","\xE0\x81\x21","\xE0\x81\x61","\xE0\x81\xA1","\xE0\x81\xE1","\xE0\xC1\x21","\xE0\xC1\x61","\xE0\xC1\xA1","\xE0\xC1\xE1"].include?(Rex::Text.to_unicode('a', 'utf-8', 'invalid', 3)), 'utf-8 invalid 3 byte') 38 } 39 40 assert_raises(TypeError) { 41 Rex::Text.to_unicode('a', 'utf-8', '', 8) 42 } 43 assert_raises(TypeError) { 44 Rex::Text.to_unicode('a', 'utf-8', 'foo', 6) 45 } 27 46 end 28 47
