Skip to content
This repository has been archived by the owner on Jan 13, 2023. It is now read-only.

A bit more feature overhaul and bug fixes #31

Open
wants to merge 16 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,5 @@ proguard/

#Other
assets/dict.properties
t9build.properties
t9build.properties
*.keystore
4 changes: 3 additions & 1 deletion AndroidManifest.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
android:versionCode="4"
android:versionName="git" >

<uses-sdk android:minSdkVersion="8" />
<uses-sdk android:minSdkVersion="11" />
<uses-permission android:name="android.permission.FOREGROUND_SERVICE" />
<uses-permission android:name="android.permission.WRITE_EXTERNAL_STORAGE"/>
<uses-configuration
android:reqFiveWayNav="true"
Expand All @@ -18,6 +19,7 @@
/>
<application
android:allowBackup="false"
android:requestLegacyExternalStorage="true"
android:icon="@drawable/ic_launcher"
android:label="@string/ime_name"
android:theme="@style/AppTheme" >
Expand Down
40 changes: 40 additions & 0 deletions NewDicts/generate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# -*- coding: utf-8 -*-
import msgpack
import gzip
import sys
import pdb
import codecs
import unicodedata

def load_buckets(path):
    """Read a gzipped msgpack frequency file and return its word buckets.

    The first element of the payload must be a header dict with
    ``format == 'cB'`` and ``version == 1``; every following element is
    returned as one bucket (a sequence of words).

    Raises:
        ValueError: if the payload is empty or the header does not match.
    """
    with gzip.open(path, 'rb') as infile:
        data = msgpack.load(infile, raw=False)
    # An empty payload is reported the same way as a bad header instead of
    # failing with an IndexError on data[0].
    header = data[0] if data else None
    if (
        not isinstance(header, dict)
        or header.get('format') != 'cB'
        or header.get('version') != 1
    ):
        raise ValueError("Unexpected header: %r" % header)
    return data[1:]


def iter_entries(buckets, border):
    """Yield ``(word, wfreq)`` pairs for every usable word in *buckets*.

    ``wfreq`` is ``len(buckets) - bucket_index``, so earlier buckets get the
    larger values. Iteration stops at the first non-empty bucket whose
    ``wfreq`` is below *border*. Words are lower-cased and NFKD-normalised;
    words containing a space or a digit are reported on stdout and skipped.
    """
    total = len(buckets)
    for bucket_no, bucket in enumerate(buckets):
        if not bucket:
            continue
        wfreq = total - bucket_no
        if wfreq < border:
            # wfreq only decreases from here on, so nothing later qualifies.
            break
        for raw_word in bucket:
            # NOTE(review): NFKD decomposes precomposed letters (e.g. Cyrillic
            # U+0439 becomes U+0438 + U+0306) -- confirm the dictionary loader
            # expects decomposed text rather than NFKC.
            word = unicodedata.normalize('NFKD', raw_word.lower())
            if " " in word:
                print("spaces not allowed")
                continue
            # A stricter filter on Unicode general categories was considered
            # here but is disabled; see
            # http://www.unicode.org/reports/tr44/tr44-6.html#General_Category_Values
            if any(char.isdigit() for char in word):
                print("Weird:", word)
                continue
            yield word, wfreq


def main(argv=None):
    """Convert a frequency file into ``word freq langno`` lines.

    Arguments (taken from ``sys.argv[1:]`` when *argv* is None):
        1. path of the gzipped msgpack input file
        2. language number (english=1, russian=2)
        3. path of the UTF-8 text file to write
        4. border: buckets with a lower ``wfreq`` are dropped

    Returns 0 on success, or 2 after printing a usage message when fewer
    than four arguments were supplied.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 4:
        sys.stderr.write(
            "usage: generate.py <input.msgpack.gz> <langno> <outfile> <border>\n"
        )
        return 2

    in_path = args[0]
    langno = int(args[1])  # english=1, russian=2
    out_path = args[2]
    border = int(args[3])

    # Load and validate first so a bad input never truncates the output file.
    buckets = load_buckets(in_path)
    with codecs.open(out_path, "w", "utf-8") as out:
        for word, wfreq in iter_entries(buckets, border):
            out.write(word + u" " + str(wfreq) + u" " + str(langno) + u"\n")
    return 0


if __name__ == "__main__":
    sys.exit(main())
Binary file added NewDicts/large_en.msgpack.gz
Binary file not shown.
Binary file added NewDicts/large_ru.msgpack.gz
Binary file not shown.
6 changes: 6 additions & 0 deletions NewDicts/readme.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
A simple, pre-populated ("preheated") English+Russian dictionary backup that is reasonably usable from the get-go.

Generated from data by LuminosoInsight:
https://github.com/LuminosoInsight/wordfreq/tree/02c3cbe3fb13fd133fb602997aa30ccc59c24c24/wordfreq/data

The generated output was then manually edited to remove erroneous values.
Loading