Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Support Yomitan v3 schema (focusing on Jitendex) #389

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions yuuna/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,6 @@ app.*.map.json
/android/app/debug
/android/app/profile
/android/app/release

# FVM Version Cache
.fvm/
60 changes: 48 additions & 12 deletions yuuna/lib/src/creator/enhancements/immersion_kit_enhancement.dart
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ class ImmersionKitResult {
required this.wordList,
required this.wordIndices,
required this.calculateRange,
required this.longestExactMatch,
});

/// The sentence in plain unformatted form.
Expand All @@ -48,6 +49,9 @@ class ImmersionKitResult {

TextRange? _calculatedRange;

/// How many consecutive characters match the search term exactly
int longestExactMatch;

/// Function to calculate the range of search term
TextRange Function() calculateRange;

Expand Down Expand Up @@ -236,6 +240,26 @@ class ImmersionKitEnhancement extends Enhancement {
}
}

/// Counts how many leading characters of [term] appear verbatim in [text],
/// starting at the first highlighted word.
///
/// The starting offset in [text] is the combined length of every entry of
/// [wordList] that precedes the first index in [wordIndices]. From there,
/// [term] and [text] are compared position by position until they differ or
/// either string ends.
///
/// Returns `0` when [wordIndices] is empty or its first index does not fit
/// within [wordList], so that a single example without a usable highlight
/// ranks as "no exact match" instead of throwing.
int _longestExactRangeForResult({
  required List<int> wordIndices,
  required List<String> wordList,
  required String term,
  required String text,
}) {
  /// Without a highlighted word there is nothing to anchor the comparison.
  if (wordIndices.isEmpty) {
    return 0;
  }

  final firstWordIndex = wordIndices.first;
  if (firstWordIndex < 0 || firstWordIndex > wordList.length) {
    return 0;
  }

  /// Start at the first character of the given cloze.
  var textPosition = 0;
  for (final word in wordList.take(firstWordIndex)) {
    textPosition += word.length;
  }

  var termPosition = 0;
  while (textPosition < text.length &&
      termPosition < term.length &&
      term[termPosition] == text[textPosition]) {
    termPosition++;
    textPosition++;
  }

  return termPosition;
}

/// Search the ImmersionKit API for example sentences and return a list of results.
Future<List<ImmersionKitResult>> searchForSentences({
required AppModel appModel,
Expand Down Expand Up @@ -290,18 +314,23 @@ class ImmersionKitEnhancement extends Enhancement {
String audioUrl = example['sound_url'];

ImmersionKitResult result = ImmersionKitResult(
text: text,
source: source,
imageUrl: imageUrl,
audioUrl: audioUrl,
wordList: wordList,
wordIndices: wordIndices,
calculateRange: () => _getRangeFromIndexedList(
wordIndices: wordIndices,
text: text,
source: source,
imageUrl: imageUrl,
audioUrl: audioUrl,
wordList: wordList,
term: searchTerm,
),
);
wordIndices: wordIndices,
calculateRange: () => _getRangeFromIndexedList(
wordIndices: wordIndices,
wordList: wordList,
term: searchTerm,
),
longestExactMatch: _longestExactRangeForResult(
wordIndices: wordIndices,
wordList: wordList,
text: text,
term: searchTerm,
));

/// Sentence examples that are merely the word itself are pretty
/// redundant.
Expand All @@ -313,7 +342,7 @@ class ImmersionKitEnhancement extends Enhancement {
/// Make sure series aren't too consecutive.
results.shuffle();

/// Results with images come first.
/// Sort by: has image -> has audio -> longest exact match -> shortest sentence
results.sort((a, b) {
int hasImage = (a.imageUrl.isNotEmpty ? -1 : 1)
.compareTo(b.imageUrl.isNotEmpty ? -1 : 1);
Expand All @@ -329,6 +358,13 @@ class ImmersionKitEnhancement extends Enhancement {
return hasAudio;
}

/// Sort by longest subterm
int longestMatch = b.longestExactMatch.compareTo(a.longestExactMatch);

if (longestMatch != 0) {
return longestMatch;
}

return a.text.length.compareTo(b.text.length);
});

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ class MassifResult {
required this.range,
required this.source,
required this.spans,
required this.longestExactMatch,
});

/// The sentence in plain unformatted form.
Expand All @@ -27,6 +28,9 @@ class MassifResult {
/// A formatted widget which may contain highlighted text.
List<InlineSpan> spans;

/// How many consecutive characters match the search term exactly
int longestExactMatch;

/// First selected range.
TextRange range;

Expand Down Expand Up @@ -102,6 +106,29 @@ class MassifExampleSentencesEnhancement extends Enhancement {
);
}

/// Counts how many leading characters of [term] appear verbatim in [text],
/// beginning at offset [start].
///
/// [term] and [text] are compared position by position from [start] until
/// they differ or either string ends.
///
/// Returns `0` when [start] is `null` or negative, since there is then no
/// valid position in [text] to compare from. A [start] at or beyond the end
/// of [text] also yields `0`.
int _longestExactRangeForResult({
  required int? start,
  required String term,
  required String text,
}) {
  /// A negative offset would make `text[start]` throw, so treat it the same
  /// as a missing offset.
  if (start == null || start < 0) {
    return 0;
  }

  /// Start at the first character of the given cloze.
  var matched = 0;
  while (start + matched < text.length &&
      matched < term.length &&
      term[matched] == text[start + matched]) {
    matched++;
  }

  return matched;
}

/// Search the Massif API for example sentences and return a list of results.
Future<List<MassifResult>> searchForSentences({
required BuildContext context,
Expand Down Expand Up @@ -209,11 +236,27 @@ class MassifExampleSentencesEnhancement extends Enhancement {
}

MassifResult result = MassifResult(
text: text,
range: range,
source: source,
spans: spans,
);
text: text,
range: range,
source: source,
spans: spans,
longestExactMatch: _longestExactRangeForResult(
start: start,
term: searchTerm,
text: text,
));

/// Sort by: longest exact match -> shortest sentence
results.sort((a, b) {
/// Sort by longest subterm
int longestMatch = b.longestExactMatch.compareTo(a.longestExactMatch);

if (longestMatch != 0) {
return longestMatch;
}

return a.text.length.compareTo(b.text.length);
});

results.add(result);
}
Expand Down
2 changes: 2 additions & 0 deletions yuuna/lib/src/dictionary/dictionary_entry.dart
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ class DictionaryEntry {
@Index()
final double popularity;

/// TODO: Add deflection

/// Returns all definitions bullet pointed if multiple, and returns the
/// single definition if otherwise.
String get compactDefinitions {
Expand Down
127 changes: 96 additions & 31 deletions yuuna/lib/src/dictionary/formats/yomichan_dictionary_format.dart
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,10 @@ import 'dart:convert';
import 'dart:io';

import 'package:async_zip/async_zip.dart';
import 'package:beautiful_soup_dart/beautiful_soup.dart';
import 'package:file_picker/file_picker.dart';
import 'package:flutter/material.dart';
import 'package:html/dom.dart' as dom;
import 'package:isar/isar.dart';
import 'package:list_counter/list_counter.dart';
import 'package:path/path.dart' as path;
import 'package:recase/recase.dart';
import 'package:yuuna/dictionary.dart';
Expand Down Expand Up @@ -56,32 +54,95 @@ class YomichanFormat extends DictionaryFormat {

@override
String getCustomDefinitionText(String meaning) {
final node =
StructuredContent.processContent(jsonDecode(meaning))?.toNode();
if (node == null) {
return '';
}
final mainBuffer = StringBuffer();
final currentLineBuffer = StringBuffer();
int indentationLevel = 0;

final node = StructuredContent.processContent(jsonDecode(meaning));
final document = dom.Document.html('');
document.body?.append(node);
for (final e in document.querySelectorAll('li')) {
final css = e.bs4.findParent('ul')?.attributes['style'] ?? '';
final text = e.text;
final name = css
.split(';')
.firstWhere((e) => e.contains('list-style-type'))
.split(':')
.lastOrNull ??
'square';

final counterStyle = CounterStyleRegistry.lookup(name);
final counter = counterStyle.generateMarkerContent(0);
e.text = '$counter $text';

/// Remove tables and attributions
document.querySelectorAll('div > a, rt').forEach((e) => e.remove());

String getIndentation() {
return ' ' * indentationLevel;
}

void flushBuffer() {
if (currentLineBuffer.isNotEmpty) {
mainBuffer.writeln('${getIndentation()}${currentLineBuffer.toString().trim()}');
currentLineBuffer.clear();
}
}

/// Attempt at making plaintext as close to structured HTML as possible
/// 1. Handle lists and list items with proper indentation and bullets
/// 2. Process ruby tags without breaking lines to keep Japanese text
/// together
/// 3. Accumulate text nodes in the current line buffer
/// 4. Recursively process child nodes
/// 5. Flush the buffer at the end of block-level elements, except within
/// ruby tags
/// - Note: It flushes prematurely upon encountering highlighted span tags
/// within ruby tags, but it shouldn't be too big of a problem since
/// highlighted span tags are rare.
void processNode(dom.Node node, {bool inRuby = false}) {
if (node is dom.Element) {
if (node.localName == 'ul' || node.localName == 'ol') {
/// Start a new line for lists to maintain structure
flushBuffer();
indentationLevel++;
for (var child in node.children) {
processNode(child);
}
indentationLevel--;
} else if (node.localName == 'li') {
/// Format list items with proper indentation and bullets
flushBuffer();
currentLineBuffer.write(getIndentation());
currentLineBuffer.write(indentationLevel > 1 ? '- ' : '• ');
for (var child in node.nodes) {
processNode(child);
}
flushBuffer();
} else if (node.localName == 'ruby') {
/// Process ruby tags without breaking the line to keep Japanese text together
for (var child in node.nodes) {
processNode(child, inRuby: true);
}
} else {
/// Recursively process other elements
for (var child in node.nodes) {
processNode(child, inRuby: inRuby);
}
}
} else if (node is dom.Text) {
/// Add non-empty text to the current line
String text = node.text.trim();
if (text.isNotEmpty) {
currentLineBuffer.write(text);
}
}

/// End the current line after block-level elements, but not within ruby tags
/// This keeps inline elements together while separating block-level content
if (!inRuby && node.parent != null &&
node == node.parent!.nodes.last &&
node.parent!.localName != 'ul' &&
node.parent!.localName != 'li' &&
node.parent!.localName != 'ruby') {
flushBuffer();
}
}

/// Process the entire body to generate the full definition text
for (var child in document.body!.nodes) {
processNode(child);
}
document.querySelectorAll('table').map((e) => e.remove());
final html = document.body?.innerHtml ?? '';

return BeautifulSoup(html).getText(separator: '\n');
/// Trim any extra whitespace from the final output
return mainBuffer.toString().trim();
}

/// Recursively get HTML for a structured content definition.
Expand Down Expand Up @@ -269,10 +330,12 @@ void prepareEntriesYomichanFormat({
isar.dictionaryHeadings.putSync(heading);

n++;
params.send(t.import_write_entry(
count: n,
total: total,
));
if (n % 1000 == 0) {
params.send(t.import_write_entry(
count: n,
total: total,
));
}
}
} else if (filename.startsWith('kanji_bank')) {
List<dynamic> items = jsonDecode(file.readAsStringSync());
Expand Down Expand Up @@ -341,10 +404,12 @@ void prepareEntriesYomichanFormat({
isar.dictionaryHeadings.putSync(heading);

n++;
params.send(t.import_write_entry(
count: n,
total: total,
));
if (n % 1000 == 0) {
params.send(t.import_write_entry(
count: n,
total: total,
));
}
}
}
}
Expand Down
Loading