Skip to content

Commit

Permalink
clean up the code a bit
Browse files Browse the repository at this point in the history
  • Loading branch information
Mononofu committed Oct 1, 2019
1 parent ff7f3d3 commit e8e85bd
Show file tree
Hide file tree
Showing 6 changed files with 12,519 additions and 10,525 deletions.
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Kanji Poster


## Dependencies

We use [colour](https://pypi.org/project/colour/) to interpolate colors and [jaconv](https://pypi.org/project/jaconv/) to convert readings between hiragana and katakana. Both are available from PyPI: `pip install colour jaconv`.
20,744 changes: 10,372 additions & 10,372 deletions kanji_grid.tex

Large diffs are not rendered by default.

205 changes: 103 additions & 102 deletions list_kanji.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
# -*- coding: utf-8
import argparse
import collections
import colour
import csv
import itertools
import json
import math
import re

import colour
import jaconv

class KanjiInfo(object):
Expand All @@ -19,18 +20,22 @@ def __init__(self, meaning, onyomi, kunyomi, wanikani_level=None, grade=None):
self.frequency = None
self.indices = {}

_NUM_WANIKANI_LEVELS = 60

def read_wanikanji():
    """Reads the WaniKani kanji list from wanikani.csv.

    Every non-blank line must consist of exactly five ';'-separated fields:
    level;kanji;meaning;onyomi;kunyomi. Only the first of several
    ','-separated readings is kept, and the placeholders 'None' and 'N/A' (or
    an empty field) are stored as None.

    Returns:
      A dict mapping each kanji to a KanjiInfo built from its line.

    Raises:
      ValueError: if a non-blank line does not have exactly five fields or its
        level is not an integer.
    """
    def first_reading(reading):
        """Returns the first ','-separated reading, or None if it is unset."""
        if not reading or reading in ('None', 'N/A'):
            return None
        return reading.split(',')[0]

    kanji_info = {}

    # The file contains kanji, so decode it explicitly instead of relying on
    # the locale's default encoding.
    # NOTE(review): assumes wanikani.csv is stored as UTF-8 -- confirm.
    with open('wanikani.csv', encoding='utf-8') as f:
        content = f.read()

    for line in content.strip().split('\n'):
        if not line.strip():
            # Tolerate blank lines (and an empty file) instead of failing on
            # the five-way unpacking below.
            continue
        level, kanji, meaning, onyomi, kunyomi = line.split(';')
        kanji_info[kanji] = KanjiInfo(meaning,
                                      first_reading(onyomi),
                                      first_reading(kunyomi),
                                      wanikani_level=int(level))

    return kanji_info

Expand All @@ -40,7 +45,6 @@ def strip_link(wiki_text):
raise ValueError("mismatch in: %s" % wiki_text)
return a


# Some kanji are outside the basic character set; in practice, these replacement
# characters are used instead (see
# https://en.wikipedia.org/wiki/List_of_jōyō_kanji#List_of_characters for
Expand All @@ -53,6 +57,7 @@ def strip_link(wiki_text):
}

def merge_with_joyo(kanji_info):
# Dumped from https://en.wikipedia.org/wiki/List_of_jōyō_kanji.
with open('joyo_kanji.txt') as f:
wiki_joyo = f.read()
for line in wiki_joyo.strip().split('\n')[8:]:
Expand Down Expand Up @@ -93,25 +98,26 @@ def add_frequency(kanji_info):
if unseen_kanji:
print('failed to find frequency info for: ', ', '.join(unseen_kanji))

# Maps each sort label (offered as a --sort_by choice in main) to the column of
# kanken_heisig.csv that add_sort_orders reads the corresponding index from.
# NOTE(review): the key '2k1K0' ends in the digit zero while its column
# '2k1KO Index' ends in the letter O, and 'kanji_leaner_course' looks like a
# misspelling of "learner". Both keys are user-visible --sort_by values, so
# renaming them would change the command line interface.
_SORT_INDICES = {
'heisig': 'Heisig RTK Index',
'rtk2': 'RTK2 Index',
'2k1K0': '2k1KO Index',
'opt_vocab_sort': 'Opt Vocab Sort Index',
'kanji_leaner_course': 'Kanji Learner Course Index',
'frequency': 'Freq.'
}

def add_sort_orders(kanji_info):
# From https://docs.google.com/spreadsheets/d/19zorQpMJi00-b6abuvE5uBAIsMMqWVrbeHD-bIrkggQ/
with open('kanken_heisig.csv') as f:
reader = csv.DictReader(f)
kanji_to_row = {row['Kanji']: row for row in reader}

indices = {
'heisig': 'Heisig RTK Index',
'rtk2': 'RTK2 Index',
'2k1K0': '2k1KO Index',
'opt_vocab_sort': 'Opt Vocab Sort Index',
'kanji_leaner_course': 'Kanji Learner Course Index',
'frequency': 'Freq.'
}

for kanji, info in kanji_info.items():
if kanji in kanji_to_row:
row = kanji_to_row[kanji]
for label, column in indices.items():
for label, column in _SORT_INDICES.items():
if row[column]:
info.indices[label] = int(row[column])

Expand All @@ -136,49 +142,6 @@ def add_radicals(kanji_info):
print('failed to find radicals for: ', ', '.join(unseen_kanji))


def group_by_radicals(kanji_info):
seen = set()

predefined_radical_groups = ['辶']

# First take kanji who are their own radicals.
grouped_kanji = {r: [] for r in predefined_radical_groups}
for kanji, info in kanji_info.items():
if not info.radicals or info.radicals == [kanji]:
seen.add(kanji)
if kanji not in grouped_kanji:
grouped_kanji[kanji] = []
grouped_kanji[kanji].append(kanji)

for kanji, info in kanji_info.items():
if kanji in seen:
continue
for radical in info.radicals:
if radical in grouped_kanji:
seen.add(kanji)
grouped_kanji[radical].append(kanji)
break

for radical, kanjis in grouped_kanji.items():
print(radical, ':', ''.join(kanjis))

print('述', kanji_info['述'].radicals)

radical_to_kanji = collections.defaultdict(list)
for kanji, info in kanji_info.items():
for n in range(1, len(info.radicals) + 1):
for radical_subset in itertools.combinations(info.radicals, n):
radical_to_kanji[''.join(radical_subset)].append(kanji)

# seen = set()
# for radical_subset, kanjis in sorted(radical_to_kanji.items(), key=lambda kv: len(kv[0]), reverse=True):
# unseen = [k for k in kanjis if k not in seen]
# if len(unseen) > 1:
# print(radical_subset, unseen)
# for kanji in kanjis:
# seen.add(kanji)


# Some of the meanings are too long to fit in one line, so we replace them with
# a shorter version.
_MEANING_REPLACEMENTS = {
Expand All @@ -203,9 +166,6 @@ def group_by_radicals(kanji_info):
}


def is_set(text):
return text and text not in ['N/A', 'None']

def color(text, c):
    r"""Wraps text in a LaTeX \textcolor command using the HTML colour model.

    Args:
      text: the content to colour.
      c: the colour as a hex string without the leading '#'.

    Returns:
      The LaTeX snippet as a string.
    """
    pieces = [r'\textcolor[HTML]{', str(c), '}{', str(text), '}']
    return ''.join(pieces)

Expand All @@ -222,52 +182,77 @@ def choose_color(info):
index = int((log_freq - min_freq) / (max_freq - min_freq) * len(_COLORS))
return _COLORS[index].hex[1:]

def generate_poster_tex(kanji_info, sort_by, minimal=False):
cell_size = 2.05
def tikz_node(kind, x, y, text=''):
    r"""Builds one tikz \node statement.

    Args:
      kind: name of the tikz style to apply to the node.
      x: horizontal position of the node.
      y: vertical position of the node.
      text: content placed inside the node; empty by default.

    Returns:
      The node statement as a string, without a trailing newline.
    """
    position = "(%f, %f)" % (x, y)
    return "\\node[%s] at %s {%s};" % (kind, position, text)

def render_kanji(kanji, info, x, y, minimal):
    """Builds the tikz nodes for one kanji cell centered at (x, y).

    The coloured kanji glyph is always emitted. Unless `minimal` is set, the
    cell additionally gets its square outline, the onyomi (converted to
    katakana) and kunyomi readings when present, and a short meaning.

    Args:
      kanji: the character to render.
      info: the kanji's info object (meaning, onyomi and kunyomi are read).
      x: horizontal center of the cell.
      y: vertical center of the cell.
      minimal: if true, only the kanji glyph itself is rendered.

    Returns:
      A list of tikz node statements.
    """
    # Each entry is (style, x offset, y offset, text), in output order.
    specs = [('Kanji', 0, 0.5, color(kanji, choose_color(info)))]

    if not minimal:
        specs.append(('Square', 0, 0, ''))

        if info.onyomi:
            specs.append(('Onyomi', 0.05, 0.1, jaconv.hira2kata(info.onyomi)))

        if info.kunyomi:
            specs.append(('Kunyomi', -0.05, 0.1, info.kunyomi))

        # Meanings that are too long for a cell have a hand-picked short form.
        short_meaning = _MEANING_REPLACEMENTS.get(
            kanji, info.meaning.split(',')[0])
        specs.append(('Meaning', 0, 1.75, short_meaning))

    return [tikz_node(style, x + dx, y + dy, text)
            for style, dx, dy, text in specs]

def generate_poster_tex(kanji_info, sort_by, minimal=False, first_n=None):
    """Generates Tex to render the kanji in kanji_info as a big poster grid.

    Besides one cell per kanji, the left margin of every row gets two labels:
    the range of kanji numbers in that row and the cumulative frequency
    reached at the end of the row.

    Args:
      kanji_info: dict mapping each kanji to its info object; each info's
        `frequency` is added to the running cumulative frequency.
      sort_by: key function applied to an info object to order the kanji.
      minimal: passed through to render_kanji for every cell.
      first_n: if truthy, only the first `first_n` kanji (after sorting) are
        rendered; None or 0 renders all of them.

    Returns:
      All tikz node statements of the grid, joined by newlines.
    """
    sorted_info = sorted(kanji_info.items(), key=lambda kv: sort_by(kv[1]))
    if first_n:
        sorted_info = sorted_info[:first_n]
    # Row bookkeeping must count the kanji that are actually rendered;
    # len(kanji_info) is wrong as soon as first_n truncates the list.
    num_kanji = len(sorted_info)

    # The center of the poster is at (0, 0). Since we are using an A0 landscape
    # poster, the total width is 118.9cm and the height 84.1cm, so the top left
    # corner is at roughly x=-59.4 and y=42.
    # TODO: We could derive cell sizes and column/row count automatically based
    # on the number of Kanji we want to show.
    cell_size = 2.05  # Must match \Size in main.tex.
    num_cols = 56

    def x(col):
        return cell_size * col - 56

    def y(row):
        return 40 - cell_size * row

    nodes = []
    cum_freq = 0
    for i, (kanji, info) in enumerate(sorted_info):
        cum_freq += info.frequency

        row, col = divmod(i, num_cols)
        nodes.extend(render_kanji(kanji, info, x(col), y(row), minimal))

        if col == num_cols - 1 or i == num_kanji - 1:
            # If this is the last character in the row, record the cumulative
            # frequency reached. Column -1 is the margin left of the grid.
            nodes.append(tikz_node('Meaning', x(-1), y(row) + 0.6,
                                   '%.2f\\%%' % (cum_freq * 100)))

    # Indicate the numbers of the kanji in each row.
    num_rows = (num_kanji + num_cols - 1) // num_cols
    for row in range(num_rows):
        nodes.append(tikz_node(
            'Meaning', x(-1), y(row) + 1.2,
            '%d - %d' % (row * num_cols + 1, (row + 1) * num_cols)))

    return '\n'.join(nodes)

def make_sort_function(index):
def get_key(info):
Expand All @@ -279,19 +264,35 @@ def get_key(info):
return get_key

def main():
    """Builds the kanji data set and writes the poster's LaTeX sources.

    Reads the command line flags, loads and merges all kanji data, then writes
    two files into the current directory: footer.tex (a one-line summary of
    kanji count and frequency coverage) and kanji_grid.tex (the tikz nodes of
    the grid, sorted according to --sort_by).
    """
    parser = argparse.ArgumentParser(
        description="Generate kanji poster LaTeX source")
    parser.add_argument('--sort_by',
                        choices=['wanikani'] + list(_SORT_INDICES.keys()),
                        default='heisig',
                        help='How to sort Kanji on the poster, default=heisig')
    # A plain boolean switch: False unless --minimal is given. (Previously a
    # stray default='minimal' was set and then overridden via set_defaults.)
    parser.add_argument('--minimal', action='store_true', default=False,
                        help='Only render the kanji themselves, without '
                             'readings, meanings and cell outlines')

    args = parser.parse_args()

    kanji_info = read_wanikanji()
    merge_with_joyo(kanji_info)
    add_frequency(kanji_info)

    # We don't use radical data at the moment, but it could be useful to
    # sort/group Kanji.
    add_radicals(kanji_info)

    add_sort_orders(kanji_info)

    # NOTE(review): the footer always says "ordered by frequency", even though
    # the actual order is chosen by --sort_by (default: heisig) -- confirm
    # whether the text should follow the flag.
    with open('footer.tex', 'w', encoding='utf-8') as f:
        f.write('%d kanji covering %.2f\\%% of common Japanese text, ordered by frequency.' % (
            len(kanji_info),
            100 * sum(info.frequency for info in kanji_info.values())))

    # The grid contains the kanji themselves, so write it as UTF-8 regardless
    # of the locale's default encoding.
    with open('kanji_grid.tex', 'w', encoding='utf-8') as f:
        f.write(generate_poster_tex(kanji_info,
                                    make_sort_function(args.sort_by),
                                    minimal=args.minimal))


if __name__ == '__main__':
Expand Down
Binary file modified main.pdf
Binary file not shown.
62 changes: 11 additions & 51 deletions main.tex
Original file line number Diff line number Diff line change
Expand Up @@ -8,58 +8,19 @@

\usetheme{Simple}

\newcommand{\Size}{2.05cm}
\tikzset{Square/.style={
inner sep=0pt,
text width=\Size,
text height=\Size,
minimum size=\Size,
draw=lightgray,
ultra thin,
align=center,
}
\newcommand{\Size}{2.05cm} % must match cell_size in list_kanji.py.
\tikzset{BaseStyle/.style={
inner sep=0pt, text width=\Size, text height=\Size, minimum size=\Size}
}

\tikzset{Kanji/.style={
inner sep=0pt,
text width=\Size,
text height=\Size,
minimum size=\Size,
font=\fontsize{36}{43},
align=center,
}
}

\tikzset{Onyomi/.style={
inner sep=0pt,
text width=\Size,
text height=\Size,
minimum size=\Size,
font=\fontsize{8}{9},
align=left,
}
}
\tikzset{Square/.style={BaseStyle, draw=lightgray, ultra thin, align=center}}

\tikzset{Kunyomi/.style={
inner sep=0pt,
text width=\Size,
text height=\Size,
minimum size=\Size,
font=\fontsize{8}{9},
align=right,
}
}
\tikzset{Kanji/.style={BaseStyle, font=\fontsize{36}{43}, align=center}}

\tikzset{Meaning/.style={
inner sep=0pt,
text width=\Size,
text height=\Size,
minimum size=\Size,
font=\scriptsize,
align=center,
text=gray,
}
}
\tikzset{SmallGreyText/.style={BaseStyle, font=\fontsize{8}{9}, text=gray}}
\tikzset{Onyomi/.style={SmallGreyText, align=left}}
\tikzset{Kunyomi/.style={SmallGreyText, align=right}}
\tikzset{Meaning/.style={SmallGreyText, align=center, font=\scriptsize}}

\begin{document}

Expand All @@ -69,9 +30,8 @@
\end{CJK}

\node [above right,outer sep=10pt,minimum width=\paperwidth,align=center] at (bottomleft) {
\input{footer}
Kanji data from \url{https://www.wanikani.com} and \url{https://en.wikipedia.org/wiki/List_of_joyo_kanji}.

\input{footer}
Kanji data from \url{https://www.wanikani.com} and \url{https://en.wikipedia.org/wiki/List_of_joyo_kanji}.
};

\end{document}
Loading

0 comments on commit e8e85bd

Please sign in to comment.