This code snippet implements a parser for Japanese kanji with Furigana.
Format
Every kanji is wrapped in curly braces ({}). The kanji symbol comes first, followed by a separator (|) and then the furigana.
These tokens are defined in an enum in the script:
enum KanjiToken { KANJI_START = "{", KANJI_END = "}", KANJI_SEPARATOR = "|" }
You can change these tokens by updating the characters in the enum.
Here are some examples:
{漢|かん}{字|じ}
{ζ|γγ}{ζ |γγγ}
{柱|はしら}
The parser also accepts kana characters everywhere in the word. For example:
{ι£|γ}γΉγ
{ι£|γ¨}γΉγ
{ε|γ}γ
{姪|めい}っ{子|こ}
Kanji characters are always required to have furigana.
Dependencies
The script has a dependency on the wanakana
library to check for valid kanji and kana characters.
Note: This dependency is not required. You may decide to implement isKanji and isKana on your own.
Below is the code extracted from wanakana for isKanji and isKana (converted from JavaScript to TypeScript). You can use it instead of installing wanakana as a dependency:
/** Inclusive bounds (U+4E00..U+9FAF) of the code units accepted as kanji. */
const KANJI_START = 0x4e00;
const KANJI_END = 0x9faf;
/** Inclusive bounds (U+3041..U+3096) of the code units accepted as hiragana. */
const HIRAGANA_START = 0x3041;
const HIRAGANA_END = 0x3096;
/** Inclusive bounds (U+30A1..U+30FC) of the code units accepted as katakana. */
const KATAKANA_START = 0x30a1;
const KATAKANA_END = 0x30fc;
/** U+30FC, the prolonged sound mark; it is also accepted as hiragana below. */
const PROLONGED_SOUND_MARK = 0x30fc;

/**
 * Tests whether the first UTF-16 code unit of `char` lies in `[start, end]`.
 * An empty `char` yields `NaN` from `charCodeAt`, so the result is `false`.
 */
const isCharInRange = (char: string, start: number, end: number): boolean => {
  const code = char.charCodeAt(0);
  return start <= code && code <= end;
};

/** True when `char` falls inside the kanji range. */
const isCharKanji = (char: string): boolean =>
  isCharInRange(char, KANJI_START, KANJI_END);

/** True when `char` is the prolonged sound mark (U+30FC). */
const isCharLongDash = (char: string): boolean =>
  char.charCodeAt(0) === PROLONGED_SOUND_MARK;

/** True when `char` is in the hiragana range or is the prolonged sound mark. */
const isCharHiragana = (char: string): boolean => {
  if (isCharLongDash(char)) return true;
  return isCharInRange(char, HIRAGANA_START, HIRAGANA_END);
};

/** True when `char` falls inside the katakana range. */
const isCharKatakana = (char: string): boolean =>
  isCharInRange(char, KATAKANA_START, KATAKANA_END);

/** True when `char` is hiragana or katakana. */
const isCharKana = (char: string): boolean =>
  isCharHiragana(char) || isCharKatakana(char);

/**
 * Tests whether `input` consists solely of kana characters.
 *
 * Empty input returns `false`: `every` on an empty list is vacuously `true`,
 * which would make this extract disagree with wanakana's `isKana`.
 *
 * @param input Text to test; iterated by code point.
 * @returns `true` only for a non-empty, all-kana string.
 */
export const isKana = (input: string): boolean =>
  input.length > 0 && [...input].every(isCharKana);

/**
 * Tests whether `input` consists solely of kanji characters.
 *
 * Empty input returns `false`, for the same reason as in `isKana`.
 *
 * @param input Text to test; iterated by code point.
 * @returns `true` only for a non-empty, all-kanji string.
 */
export const isKanji = (input: string): boolean =>
  input.length > 0 && [...input].every(isCharKanji);
Result
The result contains a list of objects (KanjiWord
):
/** A kanji symbol paired with its furigana reading. */
interface Kanji {
  /** The kanji text. */
  symbol: string;
  /** The furigana attached to `symbol`. */
  furigana: string;
}
/**
 * One element of the parser output.
 *
 * Discriminated union: `_tag` distinguishes a kanji (with furigana) from a
 * run of kana characters, and determines the type of `value`.
 */
type KanjiWord =
  | { _tag: "kanji"; value: Kanji }
  | { _tag: "kana"; value: string };
For example, the string γγ{ζ°|γ}γΎγ«{ι
|γγ°}γ«γ
returns the following array:
[
{ _tag: "kana", value: "γγ" },
{ _tag: "kanji", value: { symbol: "ζ°", furigana: "γ" } },
{ _tag: "kana", value: "γΎγ«" },
{ _tag: "kanji", value: { symbol: "ι
", furigana: "γγ°" } },
{ _tag: "kana", value: "γ«γ" }
]
You can use _tag
to distinguish between kana and kanji:
// `_tag` is the discriminant of KanjiWord: testing it narrows the type of `value`.
if (kanjiWord._tag === 'kana') {
  // Kana branch: `value` is a plain string.
  const value: string = kanjiWord.value;
  // ...
} else if (kanjiWord._tag === 'kanji') {
  // Kanji branch: `value` carries both the symbol and its furigana.
  const value: Kanji = kanjiWord.value;
  // ...
}
Full script
The script exports a single parser
function that accepts a string
as input and returns a non-empty list of KanjiWord
when successful, or an error otherwise.
Here is the full script:
import { isKana, isKanji as isKanjiWanakana } from "wanakana";
/**
 * Tests whether every UTF-16 code unit of `str` is a kanji.
 *
 * A code unit passes when wanakana's `isKanji` accepts it or when it is the
 * iteration mark U+3005, which is accepted explicitly here. The mark is
 * written as an escape so the literal cannot be damaged by a mis-encoded file.
 *
 * Note: an empty `str` returns `true` (`every` on an empty array); callers
 * check for emptiness separately.
 *
 * @param str Candidate kanji symbol.
 * @returns `true` when all code units are accepted.
 */
const isKanji = (str: string): boolean =>
  str.split("").every((char) => isKanjiWanakana(char) || char === "\u3005");
/**
 * Delimiter characters of the furigana markup. A kanji group is written as
 * KANJI_START, symbol, KANJI_SEPARATOR, furigana, KANJI_END.
 */
enum KanjiToken {
  /** Opens a kanji group. */
  KANJI_START = "{",
  /** Closes a kanji group. */
  KANJI_END = "}",
  /** Separates the kanji symbol from its furigana. */
  KANJI_SEPARATOR = "|",
}
/** A kanji symbol paired with its furigana reading. */
interface Kanji {
  /** Text found between the start token and the separator. */
  symbol: string;
  /** Text found between the separator and the end token. */
  furigana: string;
}
/**
 * One parsed element: either a kanji with furigana or a run of kana.
 * `_tag` is the discriminant that determines the type of `value`.
 */
type KanjiWord =
  | { _tag: "kanji"; value: Kanji }
  | { _tag: "kana"; value: string };
/** Failure variant shared by all parser results. */
interface ParserError {
  _tag: "Error";
  /** Human-readable description of what could not be parsed. */
  value: string;
}
/**
 * Result of parsing a single element from the front of the input: either an
 * error, or the parsed element plus the input that is still left to parse.
 */
type ParserResultTemp =
  | ParserError
  | {
      _tag: "Success";
      /** The element that was parsed. */
      value: KanjiWord;
      /** Remainder of the input after the parsed element. */
      nextSource: string;
    };
/**
 * Final result of `parser`: either an error, or the list of parsed elements.
 */
export type ParserResult =
  | ParserError
  | {
      _tag: "Success";
      // Intersection with `{ 0: KanjiWord }` requires index 0 to exist,
      // i.e. the array type is non-empty.
      value: { 0: KanjiWord } & KanjiWord[];
    };
/**
 * Parses one kanji group (start token, symbol, separator, furigana, end
 * token) from the very beginning of `source`.
 *
 * Checks run in this order: non-empty input, start token, separator present,
 * symbol non-empty and valid kanji, end token present, furigana non-empty and
 * valid kana. The first failing check produces the error.
 *
 * @param source Input whose first character must be the start token.
 * @returns The parsed kanji and the input following the end token, or an error.
 */
const parserKanji = (source: string): ParserResultTemp => {
  const fail = (message: string): ParserResultTemp => ({
    _tag: "Error",
    value: message,
  });

  if (source.length === 0) {
    return fail(`Kanji is empty`);
  }
  if (source[0] !== KanjiToken.KANJI_START) {
    return fail(`Missing kanji start token in "${source}"`);
  }

  // The symbol is everything between the start token and the first separator.
  const separatorAt = source.indexOf(KanjiToken.KANJI_SEPARATOR, 1);
  if (separatorAt === -1) {
    return fail(
      `Missing kanji separator token ("${KanjiToken.KANJI_SEPARATOR}") in "${source}"`
    );
  }
  const symbol = source.slice(1, separatorAt);
  if (symbol.length === 0) {
    return fail(`Kanji symbol is empty in "${source}"`);
  }
  if (!isKanji(symbol)) {
    return fail(`Invalid kanji symbol in "${symbol}" for "${source}"`);
  }

  // The furigana is everything between the separator and the next end token.
  const endAt = source.indexOf(KanjiToken.KANJI_END, separatorAt + 1);
  if (endAt === -1) {
    return fail(
      `Missing kanji end token ("${KanjiToken.KANJI_END}") in "${source}"`
    );
  }
  const furigana = source.slice(separatorAt + 1, endAt);
  if (furigana.length === 0) {
    return fail(
      `Kanji furigana is empty in "${source}" for symbol "${symbol}"`
    );
  }
  if (!isKana(furigana)) {
    return fail(`Invalid furigana characters in "${furigana}" for "${source}"`);
  }

  return {
    _tag: "Success",
    value: { _tag: "kanji", value: { symbol, furigana } },
    // Resume right after the end token.
    nextSource: source.slice(endAt + 1),
  };
};
/**
 * Parses the longest run of kana characters at the beginning of `source`.
 *
 * The run is found with a linear scan rather than recursion, so the cost is
 * proportional to the run length and long inputs cannot exhaust the call
 * stack. Every character in the run has individually passed `isKana`, so no
 * further validation of the run is needed.
 *
 * @param source Input expected to start with at least one kana character.
 * @returns The kana run and the input following it, or an error when
 *          `source` does not start with a kana character.
 */
const parserKana = (source: string): ParserResultTemp => {
  // Advance while the current character is kana.
  let end = 0;
  while (end < source.length && isKana(source[end])) {
    end += 1;
  }

  if (end === 0) {
    return { _tag: "Error", value: `Kana characters missing in "${source}"` };
  }

  return {
    _tag: "Success",
    value: { _tag: "kana", value: source.slice(0, end) },
    nextSource: source.slice(end),
  };
};
/**
 * Parses a string of kana and `{kanji|furigana}` groups into a list of
 * `KanjiWord` elements.
 *
 * The input is consumed from the front: an element starting with the kanji
 * start token is handled by `parserKanji`, anything else by `parserKana`.
 * The first error stops parsing and is returned unchanged.
 *
 * @param source Text to parse.
 * @returns A non-empty list of parsed elements, or an error.
 */
export const parser = (source: string): ParserResult => {
  if (source.length === 0) {
    return { _tag: "Error", value: `Source is empty` };
  }

  const words: KanjiWord[] = [];
  let remaining = source;

  while (remaining.length > 0) {
    // The first remaining character decides which sub-parser applies.
    const parseNext =
      remaining[0] === KanjiToken.KANJI_START ? parserKanji : parserKana;
    const step = parseNext(remaining);
    if (step._tag === "Error") {
      return step;
    }
    words.push(step.value);
    remaining = step.nextSource;
  }

  if (words.length === 0) {
    return { _tag: "Error", value: `Kanji is empty in "${source}"` };
  }

  return {
    _tag: "Success",
    // Spelled as head + tail so the literal matches the non-empty array type.
    value: [words[0], ...words.slice(1)],
  };
};
Testing
The parser
function has been tested on multiple inputs (using vitest
):
All tests pass; they cover general success and error cases when parsing:
import { describe, expect, test } from "vitest";
import { ParserResult, parser } from "./parser";
describe("Success", () => {
test("double kanji and single kana", () => {
expect(parser("{ζ°|γ}{ι
|γγ°}γ")).toStrictEqual<ParserResult>({
_tag: "Success",
value: [
{ _tag: "kanji", value: { symbol: "ζ°", furigana: "γ" } },
{ _tag: "kanji", value: { symbol: "ι
", furigana: "γγ°" } },
{ _tag: "kana", value: "γ" },
],
});
});
test("double kanji with kana in between", () => {
expect(parser("{ζ°|γ}γ{ι
|γγ°}")).toStrictEqual<ParserResult>({
_tag: "Success",
value: [
{ _tag: "kanji", value: { symbol: "ζ°", furigana: "γ" } },
{ _tag: "kana", value: "γ" },
{ _tag: "kanji", value: { symbol: "ι
", furigana: "γγ°" } },
],
});
});
test("double kana, kanji, double kana, kanji, double kana", () => {
expect(parser("γγ{ζ°|γ}γΎγ«{ι
|γγ°}γ«γ")).toStrictEqual<ParserResult>({
_tag: "Success",
value: [
{ _tag: "kana", value: "γγ" },
{ _tag: "kanji", value: { symbol: "ζ°", furigana: "γ" } },
{ _tag: "kana", value: "γΎγ«" },
{ _tag: "kanji", value: { symbol: "ι
", furigana: "γγ°" } },
{ _tag: "kana", value: "γ«γ" },
],
});
});
test("single kanji and kana", () => {
expect(parser("{ζ°|γ}γ")).toStrictEqual<ParserResult>({
_tag: "Success",
value: [
{ _tag: "kanji", value: { symbol: "ζ°", furigana: "γ" } },
{ _tag: "kana", value: "γ" },
],
});
});
test("single kanji", () => {
expect(parser("{ζ°|γ}")).toStrictEqual<ParserResult>({
_tag: "Success",
value: [{ _tag: "kanji", value: { symbol: "ζ°", furigana: "γ" } }],
});
});
test("single kana and single kanji", () => {
expect(parser("γ{ζ°|γ}")).toStrictEqual<ParserResult>({
_tag: "Success",
value: [
{ _tag: "kana", value: "γ" },
{ _tag: "kanji", value: { symbol: "ζ°", furigana: "γ" } },
],
});
});
test("double kana and single kanji", () => {
expect(parser("γγ{ζ°|γ}")).toStrictEqual<ParserResult>({
_tag: "Success",
value: [
{ _tag: "kana", value: "γγ" },
{ _tag: "kanji", value: { symbol: "ζ°", furigana: "γ" } },
],
});
});
test("double kana and double kanji", () => {
expect(parser("γγ{ζ°|γ}{ζ|γγ}")).toStrictEqual<ParserResult>({
_tag: "Success",
value: [
{ _tag: "kana", value: "γγ" },
{ _tag: "kanji", value: { symbol: "ζ°", furigana: "γ" } },
{ _tag: "kanji", value: { symbol: "ζ", furigana: "γγ" } },
],
});
});
test("double kana and double kanji and kana", () => {
expect(parser("γγ{ζ°|γ}{ζ|γγ}γ")).toStrictEqual<ParserResult>({
_tag: "Success",
value: [
{ _tag: "kana", value: "γγ" },
{ _tag: "kanji", value: { symbol: "ζ°", furigana: "γ" } },
{ _tag: "kanji", value: { symbol: "ζ", furigana: "γγ" } },
{ _tag: "kana", value: "γ" },
],
});
});
test("special kanji γ
character", () => {
expect(parser("{γ
|γ}")).toStrictEqual<ParserResult>({
_tag: "Success",
value: [{ _tag: "kanji", value: { symbol: "γ
", furigana: "γ" } }],
});
});
test("only kana", () => {
expect(parser("γγγ")).toStrictEqual<ParserResult>({
_tag: "Success",
value: [{ _tag: "kana", value: "γγγ" }],
});
});
});
describe("Error", () => {
test("empty string", () => {
const result = parser("");
expect(result._tag).toBe<"Error">("Error");
});
test("no kanji character", () => {
const result = parser("{γ|γγ}");
expect(result._tag).toBe<"Error">("Error");
});
test("no kana character in kanji", () => {
const result = parser("{ζ|ζ}");
expect(result._tag).toBe<"Error">("Error");
});
test("kanji without furigana", () => {
const result = parser("{ζ|γγ}ι");
expect(result._tag).toBe<"Error">("Error");
});
test("missing start kanji token", () => {
const result = parser("ζ|γγ}γ");
expect(result._tag).toBe<"Error">("Error");
});
test("missing end kanji token", () => {
const result = parser("{ζ|γγγ");
expect(result._tag).toBe<"Error">("Error");
});
test("missing separator kanji token", () => {
const result = parser("{ζγγ}γ");
expect(result._tag).toBe<"Error">("Error");
});
});
Feel free to use this snippet in your own code.