clean up the code a bit

Mononofu · Oct 1, 2019 · e8e85bd · e8e85bd
1 parent ff7f3d3
commit e8e85bd
Show file tree

Hide file tree

Showing 6 changed files with 12,519 additions and 10,525 deletions.
diff --git a/README.md b/README.md
@@ -0,0 +1,6 @@
+# Kanji Poster
+
+
+## Dependencies
+
+We use [colour](https://pypi.org/project/colour/) to interpolate colors and [jaconv](https://pypi.org/project/jaconv/) to convert readings between hiragana and katakana. Both can be installed with pip: `pip install colour jaconv`.
diff --git a/kanji_grid.tex b/kanji_grid.tex
diff --git a/list_kanji.py b/list_kanji.py
@@ -1,12 +1,13 @@
 # -*- coding: utf-8
+import argparse
 import collections
-import colour
 import csv
 import itertools
 import json
 import math
 import re
 
+import colour
 import jaconv
 
 class KanjiInfo(object):
@@ -19,18 +20,22 @@ def __init__(self, meaning, onyomi, kunyomi, wanikani_level=None, grade=None):
     self.frequency = None
     self.indices = {}
 
-_NUM_WANIKANI_LEVELS = 60
-
 def read_wanikanji():
   kanji_info = {}
 
-  for level in range(1, _NUM_WANIKANI_LEVELS + 1):
-    with open('/home/mononofu/Dropbox/eBooks/languages/japanese/WaniKani/with_similar_kanji/%02d_kanji.csv' % level) as f:
-      lines = f.read()
-    for line in lines.strip().split('\n'):
-      parts = line.split(';')
-      kanji, meaning, onyomi, kunyomi = parts[2], parts[3], parts[5], parts[6]
-      kanji_info[kanji] = KanjiInfo(meaning, onyomi, kunyomi, wanikani_level=int(level))
+  def first_reading(reading):
+    if not reading or reading in ['None', 'N/A']:
+      return None
+    return reading.split(',')[0]
+
+  with open('wanikani.csv') as f:
+    lines = f.read()
+  for line in lines.strip().split('\n'):
+    level, kanji, meaning, onyomi, kunyomi = line.split(';')
+    kanji_info[kanji] = KanjiInfo(meaning,
+                                  first_reading(onyomi),
+                                  first_reading(kunyomi),
+                                  wanikani_level=int(level))
 
   return kanji_info
 
@@ -40,7 +45,6 @@ def strip_link(wiki_text):
     raise ValueError("mismatch in: %s" % wiki_text)
   return a
 
-
 # Some kanji are outside the basic character set, instead these replacement
 # characters are used in practice (see
 # https://en.wikipedia.org/wiki/List_of_jōyō_kanji#List_of_characters for
@@ -53,6 +57,7 @@ def strip_link(wiki_text):
 }
 
 def merge_with_joyo(kanji_info):
+  # Dumped from https://en.wikipedia.org/wiki/List_of_jōyō_kanji.
   with open('joyo_kanji.txt') as f:
     wiki_joyo = f.read()
   for line in wiki_joyo.strip().split('\n')[8:]:
@@ -93,25 +98,26 @@ def add_frequency(kanji_info):
   if unseen_kanji:
     print('failed to find frequency info for: ', ', '.join(unseen_kanji))
 
+_SORT_INDICES = {
+    'heisig': 'Heisig RTK Index',
+    'rtk2': 'RTK2 Index',
+    '2k1K0': '2k1KO Index',
+    'opt_vocab_sort': 'Opt Vocab Sort Index',
+    'kanji_leaner_course': 'Kanji Learner Course Index',
+    'frequency': 'Freq.'
+}
 
 def add_sort_orders(kanji_info):
   # From https://docs.google.com/spreadsheets/d/19zorQpMJi00-b6abuvE5uBAIsMMqWVrbeHD-bIrkggQ/
   with open('kanken_heisig.csv') as f:
     reader = csv.DictReader(f)
     kanji_to_row = {row['Kanji']: row for row in reader}
 
-  indices = {
-    'heisig': 'Heisig RTK Index',
-    'rtk2': 'RTK2 Index',
-    '2k1K0': '2k1KO Index',
-    'opt_vocab_sort': 'Opt Vocab Sort Index',
-    'kanji_leaner_course': 'Kanji Learner Course Index',
-    'frequency': 'Freq.'
-  }
+
   for kanji, info in kanji_info.items():
     if kanji in kanji_to_row:
       row = kanji_to_row[kanji]
-      for label, column in indices.items():
+      for label, column in _SORT_INDICES.items():
         if row[column]:
           info.indices[label] = int(row[column])
 
@@ -136,49 +142,6 @@ def add_radicals(kanji_info):
     print('failed to find radicals for: ', ', '.join(unseen_kanji))
 
 
-def group_by_radicals(kanji_info):
-  seen = set()
-
-  predefined_radical_groups = ['辶']
-
-  # First take kanji who are their own radicals.
-  grouped_kanji = {r: [] for r in predefined_radical_groups}
-  for kanji, info in kanji_info.items():
-    if not info.radicals or info.radicals == [kanji]:
-      seen.add(kanji)
-      if kanji not in grouped_kanji:
-        grouped_kanji[kanji] = []
-      grouped_kanji[kanji].append(kanji)
-
-  for kanji, info in kanji_info.items():
-    if kanji in seen:
-      continue
-    for radical in info.radicals:
-      if radical in grouped_kanji:
-        seen.add(kanji)
-        grouped_kanji[radical].append(kanji)
-        break
-
-  for radical, kanjis in grouped_kanji.items():
-    print(radical, ':', ''.join(kanjis))
-
-  print('述', kanji_info['述'].radicals)
-
-  radical_to_kanji = collections.defaultdict(list)
-  for kanji, info in kanji_info.items():
-    for n in range(1, len(info.radicals) + 1):
-      for radical_subset in itertools.combinations(info.radicals, n):
-        radical_to_kanji[''.join(radical_subset)].append(kanji)
-
-  # seen = set()
-  # for radical_subset, kanjis in sorted(radical_to_kanji.items(), key=lambda kv: len(kv[0]), reverse=True):
-  #   unseen = [k for k in kanjis if k not in seen]
-  #   if len(unseen) > 1:
-  #     print(radical_subset, unseen)
-  #     for kanji in kanjis:
-  #       seen.add(kanji)
-
-
 # Some of the meanings are too long to fit in one line, so we replace them with
 # a shorter version.
 _MEANING_REPLACEMENTS = {
@@ -203,9 +166,6 @@ def group_by_radicals(kanji_info):
 }
 
 
-def is_set(text):
-  return text and text not in ['N/A', 'None']
-
 def color(text, c):
   return r'\textcolor[HTML]{%s}{%s}' % (c, text)
 
@@ -222,52 +182,77 @@ def choose_color(info):
   index = int((log_freq - min_freq) / (max_freq - min_freq) * len(_COLORS))
   return _COLORS[index].hex[1:]
 
-def generate_poster_tex(kanji_info, sort_by, minimal=False):
-  cell_size = 2.05
+def tikz_node(kind, x, y, text=''):
+  return "\\node[%s] at (%f, %f) {%s};" % (kind, x, y, text)
+
+def render_kanji(kanji, info, x, y, minimal):
+  """Renders a kanji and related information at the specified xy position."""
+  nodes = []
+
+  def add_node(kind, dx, dy, text=''):
+    """Adds a tikz node with the specified offset from the center."""
+    nodes.append(tikz_node(kind, x + dx, y + dy, text))
+
+  add_node('Kanji', 0, 0.5, color(kanji, choose_color(info)))
+
+  if not minimal:
+    add_node('Square', 0, 0)
+
+    if info.onyomi:
+      add_node('Onyomi', 0.05, 0.1, jaconv.hira2kata(info.onyomi))
+
+    if info.kunyomi:
+      add_node('Kunyomi', -0.05, 0.1, info.kunyomi)
+
+    meaning = info.meaning.split(',')[0]
+    if kanji in _MEANING_REPLACEMENTS:
+      meaning = _MEANING_REPLACEMENTS[kanji]
+    add_node('Meaning', 0, 1.75, meaning)
+
+  return nodes
+
+def generate_poster_tex(kanji_info, sort_by, minimal=False, first_n=None):
+  """Generates Tex to render all kanji in kanji_info in a big poster."""
+  sorted_info = sorted(kanji_info.items(), key=lambda kv: sort_by(kv[1]))
+  if first_n:
+    sorted_info = sorted_info[:first_n]
+
+  # The center of the poster is at (0, 0). Since we are using an A0 landscape
+  # poster, the total width is 118.9cm and the height 84.1cm, so the top left
+  # corner is at roughly x=-59.4 and y=42.
+  # TODO: We could derive cell sizes and column/row count automatically based on
+  # the number of Kanji we want to show.
+  cell_size = 2.05  # Must match \Size in main.tex.
   num_cols = 56
 
-  tex = ""
+  def x(col):
+    return cell_size * col - 56
+
+  def y(row):
+    return 40 - cell_size * row
+
+  nodes = []
   cum_freq = 0
-  for i, (kanji, info) in enumerate(sorted(kanji_info.items(),
-                                   key=lambda kv: sort_by(kv[1]))[0:5000]):
+  for i, (kanji, info) in enumerate(sorted_info):
     cum_freq += info.frequency
 
     row = int(i / num_cols)
-    x = cell_size * (i % num_cols) - 56.5
-    y = 40 - cell_size * row
-    if not minimal:
-      tex += "\\node[Square] at (%f, %f) {};\n" % (x, y)
-    tex += "\\node[Kanji] at (%f, %f) {%s};\n" % (x, y + 0.5, color(kanji, choose_color(info)))
-    if not minimal:
-      if is_set(info.onyomi):
-        onyomi = jaconv.hira2kata(info.onyomi.split(',')[0])
-        tex += "\\node[Onyomi] at (%f, %f) {%s};\n" % (x + 0.05, y + 0.1, onyomi)
-      if is_set(info.kunyomi):
-        tex += "\\node[Kunyomi] at (%f, %f) {%s};\n" % (x - 0.05, y + 0.1, info.kunyomi.split(',')[0])
-      meaning = info.meaning.split(',')[0]
-      if kanji in _MEANING_REPLACEMENTS:
-        print('replaced', i, kanji, meaning, _MEANING_REPLACEMENTS[kanji])
-        meaning = _MEANING_REPLACEMENTS[kanji]
-      if len(meaning) > 14:
-        print(i, kanji, meaning, )
-      tex += "\\node[Meaning] at (%f, %f) {%s};\n" % (x, y + 1.75, meaning)
-      # tex += "\\node[Meaning] at (%f, %f) {%.2f\\%%};\n" % (x, y + 1.75, cum_freq * 100)
+    col = i % num_cols
+
+    nodes.extend(render_kanji(kanji, info, x(col), y(row), minimal))
 
     if (i + 1) % num_cols == 0 or (i + 1) == len(kanji_info):
       # If this is the last character in the row, record the cumulative
       # frequency reached.
-      tex += "\\node[Meaning] at (%f, %f) {%.2f\\%%};\n" % (
-        -58.5,
-        38.8 - row * cell_size + 1.75,
-        cum_freq * 100)
+      nodes.append(tikz_node('Meaning', x(-1), y(row) + 0.6,
+                             '%.2f\\%%' % (cum_freq * 100)))
 
+  # Label each row with the range of kanji positions it contains.
   for row in range(int(math.ceil(len(kanji_info) / num_cols))):
-    tex += "\\node[Meaning] at (%f, %f) {%d - %d};\n" % (
-      -58.5,
-      39.4 - row * cell_size + 1.75,
-      row * num_cols + 1, (row + 1) * num_cols)
+    nodes.append(tikz_node('Meaning', x(-1), y(row) + 1.2,
+                           '%d - %d' % (row * num_cols + 1, (row + 1) * num_cols)))
 
-  return tex
+  return '\n'.join(nodes)
 
 def make_sort_function(index):
   def get_key(info):
@@ -279,19 +264,35 @@ def get_key(info):
   return get_key
 
 def main():
+  parser = argparse.ArgumentParser(description="Generate kanji poster LaTeX source")
+  parser.add_argument('--sort_by',
+                      choices=['wanikani'] + list(_SORT_INDICES.keys()),
+                      default='heisig',
+                      help='How to sort Kanji on the poster, default=heisig')
+  parser.add_argument('--minimal', default='minimal', action='store_true')
+  parser.set_defaults(minimal=False)
+
+  args = parser.parse_args()
+
   kanji_info = read_wanikanji()
   merge_with_joyo(kanji_info)
   add_frequency(kanji_info)
+
+  # We don't use radical data at the moment, but it could be useful to
+  # sort/group Kanji.
   add_radicals(kanji_info)
+
   add_sort_orders(kanji_info)
 
   with open('footer.tex', 'w') as f:
     f.write('%d kanji covering %.2f\\%% of common Japanese text, ordered by frequency.' % (
-      len(kanji_info),
-      100 * sum(info.frequency for info in kanji_info.values())))
+        len(kanji_info),
+        100 * sum(info.frequency for info in kanji_info.values())))
 
   with open('kanji_grid.tex', 'w') as f:
-    f.write(generate_poster_tex(kanji_info, make_sort_function('heisig'), minimal=False))
+    f.write(generate_poster_tex(kanji_info,
+                                make_sort_function(args.sort_by),
+                                minimal=args.minimal))
 
 
 if __name__ == '__main__':

diff --git a/main.pdf b/main.pdf
diff --git a/main.tex b/main.tex
@@ -8,58 +8,19 @@
 
 \usetheme{Simple}
 
-\newcommand{\Size}{2.05cm}
-\tikzset{Square/.style={
-    inner sep=0pt,
-    text width=\Size,
-    text height=\Size,
-    minimum size=\Size,
-    draw=lightgray,
-    ultra thin,
-    align=center,
-    }
+\newcommand{\Size}{2.05cm} % must match cell_size in list_kanji.py.
+\tikzset{BaseStyle/.style={
+  inner sep=0pt, text width=\Size, text height=\Size, minimum size=\Size}
 }
 
-\tikzset{Kanji/.style={
-    inner sep=0pt,
-    text width=\Size,
-    text height=\Size,
-    minimum size=\Size,
-    font=\fontsize{36}{43},
-    align=center,
-    }
-}
-
-\tikzset{Onyomi/.style={
-    inner sep=0pt,
-    text width=\Size,
-    text height=\Size,
-    minimum size=\Size,
-    font=\fontsize{8}{9},
-    align=left,
-    }
-}
+\tikzset{Square/.style={BaseStyle, draw=lightgray, ultra thin, align=center}}
 
-\tikzset{Kunyomi/.style={
-    inner sep=0pt,
-    text width=\Size,
-    text height=\Size,
-    minimum size=\Size,
-    font=\fontsize{8}{9},
-    align=right,
-    }
-}
+\tikzset{Kanji/.style={BaseStyle, font=\fontsize{36}{43}, align=center}}
 
-\tikzset{Meaning/.style={
-    inner sep=0pt,
-    text width=\Size,
-    text height=\Size,
-    minimum size=\Size,
-    font=\scriptsize,
-    align=center,
-    text=gray,
-    }
-}
+\tikzset{SmallGreyText/.style={BaseStyle, font=\fontsize{8}{9}, text=gray}}
+\tikzset{Onyomi/.style={SmallGreyText, align=left}}
+\tikzset{Kunyomi/.style={SmallGreyText, align=right}}
+\tikzset{Meaning/.style={SmallGreyText, align=center, font=\scriptsize}}
 
 \begin{document}
 
@@ -69,9 +30,8 @@
 \end{CJK}
 
 \node [above right,outer sep=10pt,minimum width=\paperwidth,align=center] at (bottomleft) {
-\input{footer}
-Kanji data from \url{https://www.wanikani.com} and \url{https://en.wikipedia.org/wiki/List_of_joyo_kanji}.
-
+  \input{footer}
+  Kanji data from \url{https://www.wanikani.com} and \url{https://en.wikipedia.org/wiki/List_of_joyo_kanji}.
 };
 
 \end{document}