User:Opencooper/bindKana.js
Appearance
Code that you insert on this page could contain malicious content capable of compromising your account. If you import a script from another page with "importScript", "mw.loader.load", "iusc", or "lusc", take note that this causes you to dynamically load a remote script, which could be changed by others. Editors are responsible for all edits and actions they perform, including by scripts. User scripts are not centrally supported and may malfunction or become inoperable due to software changes. A guide to help you find broken scripts is available. If you are unsure whether code you are adding to this page is safe, you can ask at the appropriate village pump. This code will be executed when previewing this page. |
Documentation for this user script can be added at User:Opencooper/bindKana.
// This script takes kanji with ruby text over it and removes repeated parts
// It's called automatically by showKanji.js if any furigana was added
// The basic algorithm searches for *continuous* hiragana/katakana/latin/punctuation
// strings that are in both the base and reading, and splits on these. This does
// not take into account any lexical information (so it doesn't know anything about
// particles or individual kanji readings). It can also fail for more complicated
// cases, but the script should be able to abort for these (maybe in the future we can
// continue and just ignore that specific base and substring).
// License: CC0
/**
 * Entry point: reads the base text and its furigana from `#kanjiInfo`, tries
 * to bind the kana to the parts of the base they belong to, and re-renders
 * the ruby when at least one split was found.
 *
 * Returns without doing anything when the kanji box or its ruby text is
 * hidden, when the expected ruby markup is missing, or when either the base
 * or the reading is empty.
 */
function getKanjiInfo() {
    // Don't run if the kanji or the ruby is hidden
    if ($("#kanjiInfo").css("display") === "none" || $("#kanjiInfo rt").css("display") === "none") {
        return;
    }

    // The ruby element (or its leading text node) may be absent; bail out
    // quietly instead of dereferencing `undefined`.
    const rubyElement = $("#kanjiInfo ruby")[0];
    const baseNode = rubyElement && rubyElement.childNodes[0];
    if (!baseNode) {
        return;
    }

    const kanji = baseNode.nodeValue;
    const kana = $("#kanjiInfo rt").text();
    if (!kanji || !kana) {
        return;
    }

    // bindKana() splits these single-element arrays in place
    const bases = [kanji];
    const readings = [kana];
    bindKana(bases, readings);

    // If any binding occurred
    if (bases.length > 1) {
        displayBoundKana(bases, readings);
    }
}
/**
 * Repeatedly splits `bases`/`readings` (in place) until no further binding is
 * found or the iteration cap is reached, then validates the result.
 *
 * @param {string[]} bases - Ruby base segments; modified in place.
 * @param {string[]} readings - Ruby text for each base segment; modified in place.
 * @throws {Error} When the two arrays end up with different lengths, when a
 *     non-empty base/reading pair has a length ratio of 6 or more (or 1/6 or
 *     less), or when the iteration cap is hit without a single split.
 */
function bindKana(bases, readings) {
    const maxIterations = 25;
    let iterations = 0;
    let foundBindings = true;
    while (foundBindings && iterations < maxIterations) {
        iterations++;
        foundBindings = tryBind(bases, readings);
    }

    // Sanity check
    if (bases.length !== readings.length) {
        throw new Error("bindKana.js: Bases and readings arrays don't have same lengths.");
    }

    // Check kanji:kana ratio
    for (let i = 0; i < bases.length; i++) {
        const kanjiLength = bases[i].length;
        const kanaLength = readings[i].length;
        if (kanjiLength === 0 || kanaLength === 0) { continue; }

        const ratio = kanaLength / kanjiLength;
        if (ratio >= 6 || ratio <= 1/6) {
            throw new Error("bindKana.js: kanji:kana ratio greater than 6 for `"
                            + bases[i] + "` and `" + readings[i] + "`.");
        }
    }

    // The loop only ends with `foundBindings` still true when the cap stopped
    // it, i.e. there may be bindings left that were never attempted.
    if (foundBindings && iterations >= maxIterations) {
        console.warn("bindKana.js: Encountered maximum iterations.");
        if (bases.length === 1) {
            throw new Error("bindKana.js: Encountered maximum iterations while furigana wasn't split once.");
        }
    }
}
/**
 * Makes one pass over the segments that existed when the pass started and
 * asks searchBase() to split each one that still has a reading.
 *
 * For every such segment the character classes are tried in the order
 * katakana, alphanumerics, hiragana, misc; trying stops for that segment as
 * soon as the number of segments differs from the starting count.
 *
 * @param {string[]} bases - Ruby base segments; modified in place.
 * @param {string[]} readings - Ruby text segments; modified in place.
 * @returns {boolean} true if the number of segments changed, false otherwise.
 * @throws {Error} If, after a split, a base containing kanji has an empty
 *     reading, or an empty base has a non-empty reading.
 */
function tryBind(bases, readings) {
    const searchOrder = [
        kanaRegexes.katakanaRe,
        kanaRegexes.alphanumRe,
        kanaRegexes.hiraganaRe,
        kanaRegexes.miscRe
    ];
    const startCount = bases.length;

    for (let pos = 0; pos < startCount; pos += 1) {
        // Segments without a reading have nothing left to bind
        if (readings[pos] === "") {
            continue;
        }

        for (const pattern of searchOrder) {
            searchBase(bases, readings, pos, pattern);
            if (bases.length !== startCount) {
                break;
            }
        }
    }

    if (bases.length === startCount) {
        return false;
    }

    // Make sure splitting didn't mess up the bindings
    for (const [pos, base] of bases.entries()) {
        const reading = readings[pos];
        if (kanaRegexes.kanjiRe.test(base) && reading === "") {
            throw new Error("bindKana.js: Kanji base with no reading: `"
                            + base + "` at index " + pos);
        }
        if (base === "" && reading) {
            throw new Error("bindKana.js: Blank base with reading: `"
                            + reading + "` at index " + pos);
        }
    }
    return true;
}
/**
 * Looks for runs matching `re` in `bases[index]` and tries to split the
 * base/reading pair on each run in turn, stopping at the first run that
 * actually produces a split.
 *
 * @param {string[]} bases - Ruby base segments; modified in place.
 * @param {string[]} readings - Ruby text segments; modified in place.
 * @param {number} index - Segment to examine.
 * @param {RegExp} re - One of the `kanaRegexes` patterns.
 * @throws {Error} If a split changed the segment count by anything other
 *     than exactly two.
 */
function searchBase(bases, readings, index, re) {
    const runs = bases[index].match(re);
    if (!runs) {
        return;
    }

    const countBefore = bases.length;
    // Misc stuff like whitespace should be split searching forward
    const split = (re !== kanaRegexes.miscRe) ? splitFuriganaReverse : splitFuriganaForward;

    for (const run of runs) {
        // Handle case where the furigana is just a hiragana version of the katakana
        // Only works if whole thing is split along the reading
        if (re === kanaRegexes.katakanaRe && /^[ァ-ヴ]+$/.test(bases[index])) {
            const readingAsKatakana = readings[index].hiraganaToKatakana();
            if (bases[index] === readingAsKatakana) {
                readings[index] = readingAsKatakana;
            }
        }

        split(bases, readings, index, run);

        // Nothing was split on this run; try the next one
        if (bases.length === countBefore) {
            continue;
        }

        // Splitting should result in [l|match|r] w/ ruby of [l|""|r]
        if (bases.length !== countBefore + 2) {
            throw new Error("bindKana.js: Splitting added more than two new parts.");
        }
        return;
    }
}
/**
 * Adds `String.prototype.hiraganaToKatakana()`, which returns a copy of the
 * string with every hiragana character in U+3041..U+3096 shifted by 0x60 to
 * the corresponding katakana code point. All other characters are left as is.
 *
 * Defined as a non-enumerable property so the helper does not show up in
 * `for...in` loops over strings run by other scripts on the page.
 */
Object.defineProperty(String.prototype, "hiraganaToKatakana", {
    value: function () {
        return this.replace(/[\u3041-\u3096]/g, function (ch) {
            return String.fromCharCode(ch.charCodeAt(0) + 0x0060);
        });
    },
    writable: true,
    configurable: true,
    enumerable: false
});
/**
 * Splits `bases[index]` and `readings[index]` around the LAST occurrence of
 * `substring` in each, provided it occurs in both. The search runs on the
 * reversed strings because particles are suffixes.
 *
 * On success the segment is replaced by three segments:
 * bases get [left, substring, right] and readings get [left, "", right].
 * When the substring is missing from either string nothing is changed.
 *
 * @param {string[]} bases - Ruby base segments; modified in place.
 * @param {string[]} readings - Ruby text segments; modified in place.
 * @param {number} index - Segment to split.
 * @param {string} substring - Text shared by the base and its reading.
 */
function splitFuriganaReverse(bases, readings, index, substring) {
    const needle = mw.util.escapeRegExp(reverseString(substring));
    // Everything before the first hit, the hit itself, and everything after it
    const splitter = new RegExp("(.*?)(" + needle + ")(.*)");

    const baseHit = reverseString(bases[index]).match(splitter);
    const readingHit = reverseString(readings[index]).match(splitter);

    // The substring has to be present in both the base and its reading
    if (baseHit === null || readingHit === null) {
        return;
    }

    // Groups come from the reversed text (AaBbCc -> "cC" | "bB" | "aA"), so
    // group 3 is the original left side and group 1 the original right side.
    bases.splice(index, 1,
        reverseString(baseHit[3]), substring, reverseString(baseHit[1]));
    readings.splice(index, 1,
        reverseString(readingHit[3]), "", reverseString(readingHit[1]));
}
/**
 * Returns `str` with its characters in reverse order.
 *
 * Reverses by Unicode code point rather than by UTF-16 code unit, so
 * characters outside the Basic Multilingual Plane (e.g. the kanji 𠮟) stay
 * intact instead of having their surrogate halves swapped. Combining marks
 * are still treated as separate characters.
 *
 * @param {string} str - Text to reverse.
 * @returns {string} The reversed text.
 */
function reverseString(str) {
    return Array.from(str).reverse().join("");
}
// TODO: Generalize this with reverse somehow
/**
 * Splits `bases[index]` and `readings[index]` around the FIRST occurrence of
 * `substring` in each, provided it occurs in both.
 *
 * On success the segment is replaced by three segments:
 * bases get [left, substring, right] and readings get [left, "", right].
 * When the substring is missing from either string nothing is changed.
 *
 * @param {string[]} bases - Ruby base segments; modified in place.
 * @param {string[]} readings - Ruby text segments; modified in place.
 * @param {number} index - Segment to split.
 * @param {string} substring - Text shared by the base and its reading.
 */
function splitFuriganaForward(bases, readings, index, substring) {
    const needle = mw.util.escapeRegExp(substring);
    const presence = new RegExp(needle);
    // Shortest possible left side, the substring itself, then the rest
    const splitter = new RegExp("(.*?)(" + needle + ")(.*)");

    const inBoth = presence.test(bases[index]) && presence.test(readings[index]);
    if (!inBoth) {
        return;
    }

    // Start at index, delete one element, and then insert the three parts
    const [, baseLeft, , baseRight] = bases[index].match(splitter);
    bases.splice(index, 1, baseLeft, substring, baseRight);

    const [, readingLeft, , readingRight] = readings[index].match(splitter);
    readings.splice(index, 1, readingLeft, "", readingRight);
}
/**
 * Hides the original ruby in `#kanjiInfo` and appends a new
 * `<ruby class="bound">` made of one `<rb>`/`<rt>` pair per segment, then
 * calls prettifyEnds() to tidy the segment edges.
 *
 * Segment text is inserted with `.text()` rather than concatenated into an
 * HTML string, so characters such as `<` or `&` in a base or reading are
 * shown literally instead of being parsed as markup.
 *
 * @param {string[]} bases - Ruby base segments.
 * @param {string[]} readings - Ruby text for each base segment.
 */
function displayBoundKana(bases, readings) {
    $("#kanjiInfo ruby").addClass("unbound");
    $(".unbound").css("display", "none");

    // Build new ruby element from the two bases and readings arrays
    const $boundRuby = $("<ruby>").addClass("bound");
    for (let i = 0; i < bases.length; i++) {
        $boundRuby.append(
            $("<rb>").text(String(bases[i])),
            $("<rt>").text(String(readings[i]))
        );
    }

    $("#kanjiInfo").append($boundRuby);
    prettifyEnds();
}
/**
 * Tidies every `<rb>` inside `#kanjiInfo` for nicer formatting:
 *  - an empty base is removed together with the element that follows it;
 *  - a base that is exactly one space is left alone;
 *  - a leading and/or trailing character matching `kanaRegexes.miscRe` is
 *    moved out of the base into its own `<rb>` with an empty `<rt>`, placed
 *    before the base or after the element following it, respectively.
 */
function prettifyEnds() {
    // miscRe carries the `g` flag, so its lastIndex has to be reset before each test
    const isMisc = function (ch) {
        kanaRegexes.miscRe.lastIndex = 0; // reset regex
        return kanaRegexes.miscRe.test(ch);
    };
    const unreadPair = function (ch) {
        return "<rb>" + ch + "</rb><rt></rt>";
    };

    $("#kanjiInfo rb").each(function () {
        const $base = $(this);
        let content = $base.text();

        // Rm empty ruby base and readings
        if (content === "") {
            $base.next().remove();
            $base.remove();
            return;
        }
        if (content === " ") {
            return;
        }

        const head = content.charAt(0);
        if (isMisc(head)) {
            $base.text(content.slice(1));
            $base.before(unreadPair(head));
        }

        // Re-read: the leading character may just have been stripped
        content = $base.text();
        const tail = content.slice(-1);
        if (isMisc(tail)) {
            $base.text(content.slice(0, content.length - 1));
            $base.next().after(unreadPair(tail));
        }
    });
}
// Character-class patterns shared by the binding and formatting code.
// The four "run" patterns carry the `g` flag so that String.prototype.match
// returns every run; callers that use .test() on them must reset lastIndex.
var kanaRegexes = {
    // Runs of hiragana
    hiraganaRe: /[ぁ-ゔ]+/g,
    // Runs of katakana, including the long-vowel mark
    katakanaRe: /[ァ-ヴー]+/g,
    // Runs of ASCII letters and digits
    alphanumRe: /[A-Za-z0-9]+/g,
    // Single punctuation/whitespace characters
    miscRe: /[- !.?・、「」×〜&/]/g,
    // Any single character in the listed CJK ranges (not global: used with .test only)
    kanjiRe: /[\u3400-\u4DB5\u4E00-\u9FCB\uF900-\uFA6A]/
};
// Run the binding once, immediately, when this script is evaluated
getKanjiInfo();