Skip to content

Commit

Permalink
filter out blanks and spaces in learn vocab trigrams
Browse files Browse the repository at this point in the history
  • Loading branch information
travisbrady committed Dec 13, 2013
1 parent 6a2e2a5 commit 31d58ea
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 2 deletions.
3 changes: 2 additions & 1 deletion clean.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
import re
from sys import stdin
from json import loads
Expand All @@ -11,7 +12,7 @@
j = loads(line)
s = j['text']
s = hp.unescape(s)
- s = s.lower()
+ s = s.lower().encode('utf-8')
s = s.replace("'", '')
s = s.replace("’", '')
s = s.replace('_', ' ')
Expand Down
5 changes: 4 additions & 1 deletion ocaml/lib_test/learn_vocab_tri.ml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,9 @@ let () =
In_channel.iter_lines stdin ~f:(fun line ->
incr line_no;
if !line_no mod 100_000 = 0 then eprintf "Line: %d\n%!" !line_no;
- let words = String.split line ~on:' ' in
+ let words = String.split line ~on:' '
+ |> List.filter ~f:(fun x -> x <> "" && x <> " ")
+ in
let n_words = List.length words in
total_words := !total_words + n_words;
triloop h words;
Expand All @@ -44,3 +46,4 @@ let () =
printf "%d\n" !total_words;
List.iter al ~f:(fun (x, y) -> printf "%s,%d\n" x y)


0 comments on commit 31d58ea

Please sign in to comment.