Skip to content

Commit

Permalink
feat: support BaiduPinyin backup binary format
Browse files Browse the repository at this point in the history
Co-authored-by: stevenlele <15964380+stevenlele@users.noreply.github.com>
  • Loading branch information
nopdan and stevenlele committed Mar 21, 2024
1 parent 566ef63 commit e6ef15f
Show file tree
Hide file tree
Showing 2 changed files with 172 additions and 1 deletion.
4 changes: 3 additions & 1 deletion src/ImeWlConverterCore/ConstantString.cs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ public class ConstantString
public const string BAIDU_BDICT = "百度分类词库bdict";
public const string BAIDU_BCD = "百度手机词库bcd";
public const string BAIDU_PINYIN = "百度拼音";
public const string BAIDU_PINYIN_BACKUP = "百度拼音备份词库bin";
public const string QQ_PINYIN_ENG = "QQ拼音英文";
public const string QQ_SHOUJI = "QQ手机";
public const string QQ_WUBI = "QQ五笔";
Expand Down Expand Up @@ -81,6 +82,8 @@ public class ConstantString
public const string BAIDU_SHOUJI_ENG_C = "bdsje";
public const string BAIDU_BDICT_C = "bdict";
public const string BAIDU_BCD_C = "bcd";
public const string BAIDU_PINYIN_C = "bdpy";
public const string BAIDU_PINYIN_BACKUP_C = "bdpybin";
public const string QQ_SHOUJI_C = "qqsj";
public const string QQ_WUBI_C = "qqwb";
public const string TOUCH_PAL_C = "cbsj";
Expand Down Expand Up @@ -114,7 +117,6 @@ public class ConstantString
public const string RIME_USERDB_C = "rimedb";
public const string BING_PINYIN_C = "bing";
public const string LINGOES_LD2_C = "ld2";
public const string BAIDU_PINYIN_C = "bdpy";
public const string QQ_PINYIN_ENG_C = "qqpye";
public const string XIAOYA_WUBI_C = "xywb";
public const string CANGJIE_PLATFORM_C = "cjpt";
Expand Down
169 changes: 169 additions & 0 deletions src/ImeWlConverterCore/IME/BaiduPinyinBackup.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
/*!
* This work contains code translated from the original work
* by stevenlele (https://github.com/studyzy/imewlconverter/issues/204#issuecomment-2011007855),
* translated to C# by nopdan.
*/

using Studyzy.IMEWLConverter.Entities;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;

namespace Studyzy.IMEWLConverter.IME
{
/// <summary>
/// Importer for the Baidu Pinyin backup word library (binary "bin" export).
/// </summary>
/// <remarks>
/// As read by <see cref="Import"/>, the file starts with the two bytes FF FE and
/// then holds encoded lines, each terminated by the byte pair 0A 00. Every line is
/// passed through <see cref="Decode"/> and the result is interpreted as UTF-16LE text.
/// </remarks>
[ComboBoxShow(ConstantString.BAIDU_PINYIN_BACKUP, ConstantString.BAIDU_PINYIN_BACKUP_C, 20)]
public class BaiduPinyinBackup : BaseImport, IWordLibraryImport
{
    #region IWordLibraryImport members

    // Splits "word(py|py|...)" into the word (group 1) and its pinyin list (group 2).
    private static readonly Regex ENTRY_PATTERN = new Regex(@"([^\(]+)\((.+)\)");

    /// <summary>
    /// Reads a Baidu Pinyin backup file and returns the Chinese word entries it contains.
    /// </summary>
    /// <param name="path">Path of the backup file to read.</param>
    /// <returns>The parsed entries; lines that do not look like an entry are skipped.</returns>
    /// <exception cref="ArgumentException">
    /// Thrown by <see cref="Decode"/> when a non-empty line has an invalid length or padding.
    /// </exception>
    public WordLibraryList Import(string path)
    {
        var wordLibraryList = new WordLibraryList();
        // The using block guarantees the file handle is released even when decoding throws.
        using (var fs = new FileStream(path, FileMode.Open, FileAccess.Read))
        {
            // Skip the leading FF FE marker.
            fs.Seek(2, SeekOrigin.Begin);
            // It is not known whether <cnword> always precedes <enword>, so remember
            // when the Chinese section has been entered.
            var cnFlag = false;
            while (fs.Position < fs.Length - 4)
            {
                bool endOfStream;
                var lineBytes = ReadEncodedLine(fs, out endOfStream);
                // Blank lines carry no data and cannot be decoded, so they are skipped.
                if (lineBytes.Length > 0)
                {
                    var theLine = Encoding.Unicode.GetString(Decode(lineBytes));
                    // Stop at the section that follows the Chinese words (English words are ignored).
                    if (cnFlag && (theLine == "<enword>" || theLine == "<sysusrword>"))
                    {
                        break;
                    }
                    if (theLine == "<cnword>")
                    {
                        cnFlag = true;
                    }
                    else
                    {
                        var entry = ParseEntry(theLine);
                        if (entry != null)
                        {
                            wordLibraryList.Add(entry);
                        }
                    }
                }
                // A file whose last line lacks the 0A 00 terminator must not be read forever.
                if (endOfStream)
                {
                    break;
                }
            }
        }
        return wordLibraryList;
    }

    /// <summary>
    /// Reads byte pairs from <paramref name="stream"/> until the 0A 00 line terminator
    /// or the end of the stream is reached.
    /// </summary>
    /// <param name="stream">Stream positioned at the start of a line.</param>
    /// <param name="endOfStream">
    /// Set to true when the stream ended before a terminator was found. A trailing
    /// single byte that does not form a pair is discarded.
    /// </param>
    /// <returns>The bytes of the line without the terminator.</returns>
    private static byte[] ReadEncodedLine(Stream stream, out bool endOfStream)
    {
        var lineBytes = new List<byte>();
        var pair = new byte[2];
        endOfStream = false;
        while (true)
        {
            // Stream.Read may return fewer bytes than requested and returns 0 at the end
            // of the stream, so keep reading until the pair is complete.
            var filled = 0;
            while (filled < pair.Length)
            {
                var read = stream.Read(pair, filled, pair.Length - filled);
                if (read <= 0)
                {
                    endOfStream = true;
                    return lineBytes.ToArray();
                }
                filled += read;
            }
            if (pair[0] == 0x0A && pair[1] == 0x00)
            {
                return lineBytes.ToArray();
            }
            lineBytes.AddRange(pair);
        }
    }

    /// <summary>
    /// Parses one decoded line into a word entry.
    /// </summary>
    /// <remarks>
    /// Expected line format:
    /// <c>百度输入法(bai|du|shu|ru|fa) 2 24 1703756731 N N</c>.
    /// Only the first field (word and pinyin) and the second field (stored as the rank) are used.
    /// </remarks>
    /// <param name="line">Decoded text of a single line.</param>
    /// <returns>The entry, or null when the line does not match the expected format.</returns>
    private static WordLibrary ParseEntry(string line)
    {
        var fields = line.Split(' ');
        if (fields.Length < 2)
        {
            return null;
        }
        // The file is machine-written, so parse with the invariant culture. A non-numeric
        // field is treated like any other malformed line and skipped.
        int rank;
        if (!int.TryParse(
                fields[1],
                System.Globalization.NumberStyles.Integer,
                System.Globalization.CultureInfo.InvariantCulture,
                out rank))
        {
            return null;
        }
        var match = ENTRY_PATTERN.Match(fields[0]);
        if (!match.Success)
        {
            return null;
        }
        return new WordLibrary
        {
            Rank = rank,
            Word = match.Groups[1].Value,
            PinYin = match.Groups[2].Value.Split('|')
        };
    }

    #endregion

    #region Decoding algorithm

    // XOR mask applied to every 32-bit chunk of the payload.
    private const uint MASK = 0x2D382324;
    // The 64-symbol alphabet; a symbol's index in this string is its 6-bit value.
    private static readonly byte[] TABLE = Encoding.ASCII.GetBytes("qogjOuCRNkfil5p4SQ3LAmxGKZTdesvB6z_YPahMI9t80rJyHW1DEwFbc7nUVX2-");
    // Built once by the type initializer so the static Decode method works without an
    // instance. Must stay declared after TABLE: static initializers run in textual order.
    private static readonly byte[] DECODE_TABLE = BuildDecodeTable();

    /// <summary>
    /// Creates the importer. The decode table is initialized statically, so nothing is done here.
    /// </summary>
    public BaiduPinyinBackup()
    {
    }

    /// <summary>
    /// Builds the byte-to-value lookup table: alphabet symbols map to their index in
    /// <see cref="TABLE"/>, every other byte maps to itself.
    /// </summary>
    private static byte[] BuildDecodeTable()
    {
        var table = new byte[256];
        for (var i = 0; i < table.Length; i++)
        {
            table[i] = (byte)i;
        }
        for (var i = 0; i < TABLE.Length; i++)
        {
            table[TABLE[i]] = (byte)i;
        }
        return table;
    }

    /// <summary>
    /// Decodes one encoded line of the backup file into its raw bytes.
    /// </summary>
    /// <remarks>
    /// The input is a sequence of 4-symbol groups followed by a 2-byte trailer. The first
    /// trailer byte is 'A', 'B' or 'C' (remainder 0, 1 or 2) and the second must be 0.
    /// Each group yields three bytes: symbols 1-3 hold the low six bits and symbol 4 holds
    /// the top two bits of each. The resulting bytes are then processed in 32-bit chunks:
    /// XOR with <see cref="MASK"/>, then rotate left by three bits. A trailing partial
    /// chunk is only XOR-ed.
    /// NOTE(review): BitConverter uses the host byte order while the trailing chunk is
    /// assembled little-endian, so this presumably assumes a little-endian host.
    /// </remarks>
    /// <param name="data">Encoded line without its line terminator.</param>
    /// <returns>The decoded bytes.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
    /// <exception cref="ArgumentException">The length or the padding of the input is invalid.</exception>
    public static byte[] Decode(byte[] data)
    {
        if (data == null)
            throw new ArgumentNullException(nameof(data));
        if (data.Length % 4 != 2)
            throw new ArgumentException("Invalid data length");

        // Kept as int so that a trailer byte below 'A' is seen as negative instead of wrapping.
        int base64Remainder = data[data.Length - 2] - 65;
        if (base64Remainder < 0 || base64Remainder > 2 || data[data.Length - 1] != 0)
            throw new ArgumentException("Invalid padding");

        // Map every payload byte (everything except the 2-byte trailer) through the table.
        int payloadLength = data.Length - 2;
        byte[] newData = new byte[payloadLength];
        for (int i = 0; i < payloadLength; i++)
        {
            newData[i] = DECODE_TABLE[data[i]];
        }

        // payloadLength is a multiple of 4 here, so every group is complete.
        var transformed = new List<byte>(payloadLength / 4 * 3);
        for (int i = 0; i + 3 < payloadLength; i += 4)
        {
            byte highBits = newData[i + 3];
            transformed.Add((byte)(newData[i] | ((highBits & 0b110000) << 2)));
            transformed.Add((byte)(newData[i + 1] | ((highBits & 0b1100) << 4)));
            transformed.Add((byte)(newData[i + 2] | ((highBits & 0b11) << 6)));
        }

        // A non-zero remainder means the last group was padded: drop the zero filler bytes.
        if (base64Remainder > 0)
        {
            for (int i = 0; i < 3 - base64Remainder; i++)
            {
                if (transformed.Count == 0 || transformed[transformed.Count - 1] != 0)
                    throw new ArgumentException("Invalid padding");
                transformed.RemoveAt(transformed.Count - 1);
            }
        }
        var result = transformed.ToArray();

        int tailLength = result.Length % 4;
        int fullLength = result.Length - tailLength;
        List<byte> finalResult = new List<byte>(result.Length);
        for (int i = 0; i < fullLength; i += 4)
        {
            uint chunk = MASK ^ BitConverter.ToUInt32(result, i);
            // Rotate left by three bits.
            chunk = ((chunk & 0x1FFFFFFF) << 3) | (chunk >> 29);
            finalResult.AddRange(BitConverter.GetBytes(chunk));
        }

        if (tailLength != 0)
        {
            // Assemble the remaining 1-3 bytes little-endian, XOR them, and keep only as
            // many output bytes as there were input bytes. No rotation is applied here.
            uint num = 0;
            for (int i = 0; i < tailLength; i++)
            {
                num |= (uint)result[fullLength + i] << (i * 8);
            }
            byte[] chunkBytes = BitConverter.GetBytes(MASK ^ num);
            for (int i = 0; i < tailLength; i++)
            {
                finalResult.Add(chunkBytes[i]);
            }
        }

        return finalResult.ToArray();
    }

    #endregion

    #region IWordLibraryImport Members

    /// <summary>
    /// Always false: this importer reads a binary file, not plain text.
    /// </summary>
    public override bool IsText
    {
        get { return false; }
    }

    #endregion

    /// <summary>
    /// Not supported: the backup format is binary and cannot be converted line by line.
    /// </summary>
    /// <param name="line">Ignored.</param>
    /// <exception cref="Exception">Always thrown.</exception>
    public WordLibraryList ImportLine(string line)
    {
        throw new Exception("百度输入法备份格式是二进制文件,不支持流转换");
    }
}
}

0 comments on commit e6ef15f

Please sign in to comment.