polm · polm · Jul 3, 2024 · Jul 3, 2024
diff --git a/cutlet/cutlet.py b/cutlet/cutlet.py
@@ -45,7 +45,7 @@ def has_foreign_lemma(word):
  if not '-' in lemma:
  return False
 
- cand = lemma.split('-')[-1]
+ cand = lemma.split('-', 1)[-1]
  # NOTE: some words have 外国 instead of a foreign spelling. ジル
  # (Jill?) is an example. Unclear why this is the case.
  # There are other hyphenated lemmas, like 私-代名詞.
@@ -257,6 +257,8 @@ def romaji_tokens(self, words, capitalize=True, title=False):
  if nw and nw.feature.pos1 in ('補助記号', '接尾辞'): continue
  # special case for half-width commas
  if nw and nw.surface == ',': continue
+ # special case for foreign-spelling prefixes that end in a hyphen (e.g. "bio-"), see #56
+ if foreign and roma[-1] == "-": continue
  # 思えば -> omoeba
  if nw and nw.feature.pos2 in ('接続助詞'): continue
  # 333 -> 333 ; this should probably be handled in mecab
@@ -348,7 +350,7 @@ def romaji_word(self, word):
  elif (self.use_foreign_spelling and
  has_foreign_lemma(word)):
  # this is a foreign word with known spelling
- return word.feature.lemma.split('-')[-1]
+ return word.feature.lemma.split('-', 1)[-1]
  elif word.feature.kana:
  # for known words
  kana = jaconv.kata2hira(word.feature.kana)

diff --git a/cutlet/test/test_basic.py b/cutlet/test/test_basic.py
@@ -101,6 +101,9 @@
  ("くヽる", "Ku ru"),
  ("今度クヾペへ行こう", "Kondo kugupe e ikou"), # made up word
  ("彁々", "?"),
+ # prefixes, see #56
+ ("ビオハザード", "Bio-hazard"),
+ ("イントラワード", "Intra-word"),
 ]
 
 SENTENCES_KUNREI = [