Skip to content

Commit

Permalink
tags: Fix issues in the perfect hash implementation
Browse files Browse the repository at this point in the history
The result of the perfect hash for tag names was not being properly
compared against the candidate string, as no length checks were being
taken into account.

Hence, looking up `t` would match `textarea` because both strings hashed
to the same value (by coincidence), and yet only the first byte of the
strings was being compared for a full match.

The issue has been fixed by adding a table of tag lengths, which will
additionally speed up the rejection of invalid strings.

In order to simplify the generation of the several automated Tag tables,
and to remove the depedency on `sed` from the Makefile, a simple
`gentags.py` has been added to the codebase.

Running `make gentags` will generate all the tables **and** the perfect
hash function, assuming that Python and the correct version of MPH is in
the path.

An updated version of mph has been pushed to the following repository:

	https://github.com/vmg/mph

It contains all the changes required to generate case-insensitive hashes
just like the ones used in the library, with no further modification to
the hash output.

Conflicts:
	src/tag.c
  • Loading branch information
vmg committed Feb 27, 2015
1 parent 69b580a commit e5f0f84
Show file tree
Hide file tree
Showing 5 changed files with 52 additions and 15 deletions.
13 changes: 5 additions & 8 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,9 @@ clean-local:

endif !HAVE_SHARED_LIBGTEST

src/tag_strings.h: src/tag.in
@sed 's/\(.*\)/"\1",/g' <$< >$@

src/tag_enum.h: src/tag.in
@sed 's/\(.*\)/GUMBO_TAG_\U\1,/g;s/-/_/g' <$< >$@

python/gumbo/gumboc_tags.py: src/tag.in
@sed -e '1i TagNames = [' -e 's/\(.*\)/\t"\U\1",/g' -e 's/-/_/g' -e "\$$a]" <$< >$@
gentags: src/tag.in
@python gentags.py $<
@mph -d2 -m2 -c1.33 <$< | emitc -s -l -i >src/tag_perf.h

lib_LTLIBRARIES = libgumbo.la
libgumbo_la_CFLAGS = -Wall
Expand All @@ -65,7 +60,9 @@ libgumbo_la_SOURCES = \
src/string_piece.h \
src/tag.c \
src/tag_enum.h \
src/tag_perf.h \
src/tag_strings.h \
src/tag_sizes.h \
src/token_type.h \
src/tokenizer.c \
src/tokenizer.h \
Expand Down
27 changes: 27 additions & 0 deletions gentags.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import sys

tag_strings = open("src/tag_strings.h", "w")
tag_enum = open("src/tag_enum.h", "w")
tag_sizes = open("src/tag_sizes.h", "w")

tag_py = open("python/gumbo/gumboc_tags.py", "w")
tag_py.write('TagNames = [\n')

tagfile = open(sys.argv[1])

for tag in tagfile:
tag = tag.strip()
tag_upper = tag.upper().replace('-', '_')
tag_strings.write('"%s",\n' % tag)
tag_enum.write('GUMBO_TAG_%s,\n' % tag_upper)
tag_sizes.write('%d, ' % len(tag))
tag_py.write('\t"%s",\n' % tag_upper)

tagfile.close()

tag_strings.close()
tag_enum.close()
tag_sizes.close()

tag_py.write(']\n')
tag_py.close()
11 changes: 10 additions & 1 deletion src/tag.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include <assert.h>
#include <ctype.h>
#include <stdint.h>
#include <strings.h> // For strcasecmp.
#include <string.h> // For strcasecmp.

Expand All @@ -27,6 +28,12 @@ const char* kGumboTagNames[] = {
"", // TAG_LAST
};

static const uint8_t kGumboTagSizes[] = {
# include "tag_sizes.h"
0, // TAG_UNKNOWN
0, // TAG_LAST
};

const char* gumbo_normalized_tagname(GumboTag tag) {
assert(tag <= GUMBO_TAG_LAST);
return kGumboTagNames[tag];
Expand Down Expand Up @@ -76,7 +83,9 @@ case_memcmp(const char *s1, const char *s2, int n)

GumboTag gumbo_tagn_enum(const char* tagname, int length) {
int position = perfhash((const unsigned char *)tagname, length);
if (position >= 0 && !case_memcmp(tagname, kGumboTagNames[position], length))
if (position >= 0 &&
length == kGumboTagSizes[position] &&
!case_memcmp(tagname, kGumboTagNames[position], length))
return (GumboTag)position;
return GUMBO_TAG_UNKNOWN;
}
Expand Down
15 changes: 9 additions & 6 deletions src/tag_perf.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ static int T0[] = {
41, 63, 101, 63, 112, 96, 146, 90, 5, 132,
153, 95, 32, 15, 7, 80, 26, 57, 103, 191,
83, 126, 134, 169, 55, 90, 55, 74, 58, 69,
5, 99, 132, 58,
5, 99, 132, 58,
};

static int T1[] = {
Expand All @@ -71,11 +71,14 @@ static int T1[] = {
84, 156, 187, 18, 18, 119, 79, 169, 168, 148,
88, 0, 122, 3, 169, 88, 139, 146, 88, 144,
86, 148, 5, 150, 17, 105, 81, 137, 98, 113,
120, 182, 69, 107,
120, 182, 69, 107,
};

static int
perfhash(const unsigned char *key, int len)
#ifndef perfhash_tolower
#define perfhash_tolower(c) tolower(c)
#endif

static int perfhash(const unsigned char *key, int len)
{
int i;
int n;
Expand All @@ -85,9 +88,9 @@ perfhash(const unsigned char *key, int len)
if (len < 1 || len > 14)
return -1;

for (i=-45, n=0, f0=f1=0; n < len; ++n) {
for (i=-45, f0=f1=0, n=0; n<len; ++n) {
int c = kp[n];
c = tolower(c);
c = perfhash_tolower(c);
if (c < 45 || c > 121)
return -1;
f0 += T0[i + c];
Expand Down
1 change: 1 addition & 0 deletions src/tag_sizes.h
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
4, 4, 5, 4, 4, 4, 5, 6, 8, 8, 4, 7, 7, 3, 5, 2, 2, 2, 2, 2, 2, 6, 6, 6, 7, 1, 2, 3, 10, 2, 2, 2, 2, 2, 2, 6, 10, 4, 3, 1, 2, 6, 5, 1, 4, 1, 3, 4, 4, 4, 4, 3, 4, 3, 3, 3, 1, 1, 1, 4, 4, 2, 2, 3, 3, 4, 2, 3, 3, 3, 5, 3, 6, 5, 6, 5, 5, 5, 6, 5, 6, 3, 4, 4, 2, 2, 2, 2, 5, 6, 10, 14, 3, 13, 4, 5, 7, 8, 3, 5, 5, 5, 2, 2, 2, 4, 8, 6, 5, 5, 6, 6, 8, 8, 6, 8, 6, 6, 8, 5, 7, 7, 4, 8, 6, 7, 7, 3, 5, 8, 8, 7, 7, 3, 6, 7, 9, 2, 6, 8, 3, 5, 6, 4, 7, 8, 4, 6, 2, 3,

0 comments on commit e5f0f84

Please sign in to comment.