forked from Jezza/mutf8
-
Notifications
You must be signed in to change notification settings - Fork 0
/
mutf8.rs
203 lines (177 loc) · 4.83 KB
/
mutf8.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
use std::borrow::Cow;
/// Converts standard UTF-8 bytes into Java-style Modified UTF-8 (MUTF-8).
///
/// Two rewrites are applied; every other byte is copied through unchanged:
///
/// * a NUL byte (`0x00`) becomes the two-byte sequence `0xC0 0x80`;
/// * a well-formed four-byte sequence (`11110xxx 10xxxxxx 10xxxxxx 10xxxxxx`)
///   encoding a code point in `U+10000..=U+10FFFF` becomes a surrogate pair,
///   each half written as its own three-byte sequence
///   (`0xED 0xAx 0x..` followed by `0xED 0xBx 0x..`).
///
/// Malformed input (a truncated or overlong four-byte sequence, a value above
/// `U+10FFFF`, stray continuation bytes, ...) is never rejected and never
/// causes a panic: the offending bytes are passed through verbatim. As a
/// consequence the output never contains a raw `0x00` byte.
///
/// # Returns
///
/// `Cow::Borrowed(input)` when the input contains neither a NUL byte nor a
/// `11110xxx` lead byte (nothing could need rewriting); otherwise a
/// `Cow::Owned` buffer holding the converted bytes.
pub fn utf8_to_mutf8(input: &[u8]) -> Cow<'_, [u8]> {
    // Only NUL bytes and 4-byte lead bytes can ever trigger a rewrite, so the
    // prefix before the first one of those can be borrowed or bulk-copied.
    let first = match input.iter().position(|&b| b == 0 || b & 0xF8 == 0xF0) {
        Some(pos) => pos,
        None => return Cow::Borrowed(input),
    };

    // Each rewrite grows the data by at most 2 bytes; reserve room for one.
    let mut data = Vec::with_capacity(input.len() + 2);
    data.extend_from_slice(&input[..first]);

    let mut i = first;
    while i < input.len() {
        let byte1 = input[i];

        if byte1 == 0 {
            // Modified UTF-8 spells NUL as an (overlong) 2-byte sequence.
            data.extend_from_slice(&[0xC0, 0x80]);
            i += 1;
            continue;
        }

        if byte1 & 0xF8 == 0xF0 {
            // Beginning of a 4-byte encoding: turn it into two 3-byte encodings.
            // Bits in: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            if let Some(&[byte2, byte3, byte4]) = input.get(i + 1..i + 4) {
                let well_formed =
                    byte2 & 0xC0 == 0x80 && byte3 & 0xC0 == 0x80 && byte4 & 0xC0 == 0x80;

                // Reconstruct the full 21-bit value.
                let bits = (u32::from(byte1 & 0x07) << 18)
                    | (u32::from(byte2 & 0x3F) << 12)
                    | (u32::from(byte3 & 0x3F) << 6)
                    | u32::from(byte4 & 0x3F);

                // Only supplementary-plane code points have a surrogate form.
                // Checking the range also guarantees the subtraction below
                // cannot underflow.
                if well_formed && (0x1_0000..=0x10_FFFF).contains(&bits) {
                    // 20-bit offset: top 10 bits -> high surrogate,
                    // bottom 10 bits -> low surrogate.
                    let offset = bits - 0x1_0000;
                    data.extend_from_slice(&[
                        // Bits out: 11101101 1010xxxx 10xxxxxx
                        0xED,
                        0xA0 | (offset >> 16) as u8,
                        0x80 | ((offset >> 10) & 0x3F) as u8,
                        // Bits out: 11101101 1011xxxx 10xxxxxx
                        0xED,
                        0xB0 | ((offset >> 6) & 0x0F) as u8,
                        0x80 | (offset & 0x3F) as u8,
                    ]);
                    i += 4;
                    continue;
                }
            }
            // Truncated or malformed 4-byte sequence: fall through and copy
            // the lead byte as-is; the following bytes are handled one by one.
        }

        // 1-, 2- and 3-byte sequences (and anything unrecognised) are
        // byte-for-byte identical in UTF-8 and MUTF-8.
        data.push(byte1);
        i += 1;
    }

    Cow::Owned(data)
}
/// Converts Java-style Modified UTF-8 (MUTF-8) bytes into standard UTF-8.
///
/// The input is walked one encoded sequence at a time (1, 2 or 3 bytes,
/// chosen from the lead byte). Two rewrites are applied:
///
/// * the two-byte sequence `0xC0 0x80` becomes a single NUL byte (`0x00`);
/// * a surrogate pair written as two three-byte sequences
///   (`0xED 0xAx 10xxxxxx` followed by `0xED 0xBx 10xxxxxx`) becomes the
///   four-byte UTF-8 encoding of the supplementary code point it denotes.
///
/// Everything else is copied through unchanged. That includes an unpaired
/// surrogate half, a sequence cut short by the end of the input, and bytes
/// that are not valid MUTF-8 lead bytes at all. No padding bytes are
/// invented and nothing is dropped.
///
/// # Returns
///
/// `Cow::Borrowed(input)` when no rewrite was necessary; otherwise a
/// `Cow::Owned` buffer holding the converted bytes.
pub fn mutf8_to_utf8(input: &[u8]) -> Cow<'_, [u8]> {
    let len = input.len();

    // Stays `None` for as long as the output is byte-identical to the input,
    // so that nothing is allocated unless a rewrite is actually needed.
    let mut data: Option<Vec<u8>> = None;

    // Starts the owned output with everything that precedes the first rewrite.
    // The output is never longer than the input, so `len` is enough capacity.
    let copy_prefix = |upto: usize| -> Vec<u8> {
        let mut buf = Vec::with_capacity(len);
        buf.extend_from_slice(&input[..upto]);
        buf
    };

    let mut i = 0;
    while i < len {
        let rest = &input[i..];
        let byte1 = rest[0];

        // Modified UTF-8 spelling of NUL.
        if rest.starts_with(&[0xC0, 0x80]) {
            data.get_or_insert_with(|| copy_prefix(i)).push(0);
            i += 2;
            continue;
        }

        // Surrogate pair:
        // Bits in: 11101101 1010xxxx 10xxxxxx
        // Bits in: 11101101 1011xxxx 10xxxxxx
        if let Some(&[0xED, byte2, byte3, 0xED, byte5, byte6]) = rest.get(..6) {
            let is_pair = byte2 & 0xF0 == 0xA0
                && byte3 & 0xC0 == 0x80
                && byte5 & 0xF0 == 0xB0
                && byte6 & 0xC0 == 0x80;
            if is_pair {
                // The high half carries the top 10 bits of (code point - 0x10000),
                // the low half the bottom 10 bits.
                let bits: u32 = 0x1_0000
                    + (u32::from(byte2 & 0x0F) << 16)
                    + (u32::from(byte3 & 0x3F) << 10)
                    + (u32::from(byte5 & 0x0F) << 6)
                    + u32::from(byte6 & 0x3F);

                // Bits out: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                data.get_or_insert_with(|| copy_prefix(i)).extend_from_slice(&[
                    0xF0 | ((bits >> 18) & 0x07) as u8,
                    0x80 | ((bits >> 12) & 0x3F) as u8,
                    0x80 | ((bits >> 6) & 0x3F) as u8,
                    0x80 | (bits & 0x3F) as u8,
                ]);
                // Both 3-byte halves have been consumed.
                i += 6;
                continue;
            }
        }

        // Anything else is already valid as-is: step over the whole sequence
        // so its trailing bytes are never mistaken for a new lead byte.
        let width = if byte1 & 0xE0 == 0xC0 {
            2
        } else if byte1 & 0xF0 == 0xE0 {
            3
        } else {
            // ASCII, stray continuation bytes and bytes that cannot start a
            // MUTF-8 sequence are handled one at a time.
            1
        };
        // Clamp so a sequence truncated by the end of the input is copied
        // as far as it goes instead of being padded.
        let end = (i + width).min(len);
        if let Some(out) = data.as_mut() {
            out.extend_from_slice(&input[i..end]);
        }
        i = end;
    }

    match data {
        Some(converted) => Cow::Owned(converted),
        None => Cow::Borrowed(input),
    }
}