Skip to content

Commit

Permalink
reduce lenencode bits (#232)
Browse files (browse the repository at this point in the history)
  • Loading branch information
benibela committed Dec 17, 2021
1 parent 462093b commit 8a4cd4c
Show file tree
Hide file tree
Showing 3 changed files with 7,076 additions and 7,065 deletions.
8 changes: 4 additions & 4 deletions data/data_generator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -174,13 +174,13 @@ def cpary2c(array)
return "UINT16_MAX" if array.nil? || array.length == 0
lencode = array.length - 1 #no sequence has len 0, so we encode len 1 as 0, len 2 as 1, ...
array = cpary2utf16encoded(array)
- if lencode >= 7 #we have only 3 bits for the length (which is already cutting it close. might need to change it to 2 bits in future Unicode versions)
+ if lencode >= 3 #we have only 2 bits for the length
array = [lencode] + array
- lencode = 7
+ lencode = 3
end
idx = pushary(array)
- raise "Array index out of bound" if idx > 0x1FFF
- return "#{idx | (lencode << 13)}"
+ raise "Array index out of bound" if idx > 0x3FFF
+ return "#{idx | (lencode << 14)}"
end
def singlecpmap(cp)
return "UINT16_MAX" if cp == nil
Expand Down
6 changes: 3 additions & 3 deletions utf8proc.c
Original file line number Diff line number Diff line change
Expand Up @@ -356,9 +356,9 @@ static utf8proc_int32_t seqindex_decode_index(const utf8proc_uint32_t seqindex)

static utf8proc_ssize_t seqindex_write_char_decomposed(utf8proc_uint16_t seqindex, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
utf8proc_ssize_t written = 0;
- const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex & 0x1FFF];
- int len = seqindex >> 13;
- if (len >= 7) {
+ const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex & 0x3FFF];
+ int len = seqindex >> 14;
+ if (len >= 3) {
len = *entry;
entry++;
}
Expand Down
Loading

0 comments on commit 8a4cd4c

Please sign in to comment.