forked from antirez/smaz
-
Notifications
You must be signed in to change notification settings - Fork 8
/
smac.c
330 lines (267 loc) · 10.4 KB
/
smac.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
/*
Copyright (C) 2012 Paul Gardner-Stephen
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
/*
Compress short strings using english letter, bigraph, trigraph and quadgraph
frequencies.
This part of the process only cares about lower-case letter sequences. Encoding
of word breaks, case changes and non-letter characters will be dealt with by
separate layers.
Interpolative coding is probably a good choice for a component in those higher
layers, as it will allow efficient encoding of word break positions and other
items that are possibly "clumpy"
*/
#include <stdio.h>
#include <strings.h>
#include <math.h>
#include <stdlib.h>
#include <unistd.h>
#include <ctype.h>
#include "arithmetic.h"
#include "charset.h"
#include "packed_stats.h"
#include "smac.h"
#include "unicode.h"
int encodeLCAlphaSpace(range_coder *c,unsigned short *s,int len,stats_handle *h,
double *entropyLog);
int encodeNonAlpha(range_coder *c,unsigned short *s,int len);
int stripNonAlpha(unsigned short *in,int in_len,
unsigned short *out,int *out_len);
int stripCase(unsigned short *in,int len,unsigned short *out);
int mungeCase(unsigned short *m,int len);
int encodeCaseModel1(range_coder *c,unsigned short *line,int len,stats_handle *h);
int decodeNonAlpha(range_coder *c,int nonAlphaPositions[],
unsigned char nonAlphaValues[],int *nonAlphaCount,
int messageLength);
int decodeCaseModel1(range_coder *c,unsigned short *line,int len,stats_handle *h);
int decodeLCAlphaSpace(range_coder *c,unsigned short *s,int length,stats_handle *h,
double *entropyLog);
int decodePackedASCII(range_coder *c, unsigned char *m,int encodedLength);
int encodePackedASCII(range_coder *c,unsigned char *m);
unsigned int probPackedASCII=0.05*0xffffff;
int stats3_decompress_bits(range_coder *c,unsigned char m[1025],int *len_out,
stats_handle *h,double *entropyLog)
{
int i;
*len_out=0;
/* Check if message is encoded naturally */
int b7=range_decode_equiprobable(c,2);
int b6=range_decode_equiprobable(c,2);
int notRawASCII=0;
if (b7&&(!b6)) notRawASCII=1;
if (notRawASCII==0) {
/* raw bytes -- copy from input to output.
But use range_decode_equiprobable() so that we can decode from non-byte
boundaries. We now include a null byte to terminate the string and make
the decoding definitive.
*/
// Reconstitute first byte
unsigned char b5to0 = range_decode_equiprobable(c,64);
b5to0|=(b6<<6);
b5to0|=(b7<<7);
m[0]=b5to0;
// printf("Read byte 0x%02x\n",m[0]);
// Also stop decoding if there are too many bytes (2K should be a reasonable
// limit).
for(i=1;m[i-1]&&(i<2048);i++) {
int r=range_decode_equiprobable(c,256);
// printf("Read byte 0x%02x\n",r);
// ... or if we hit the end of the compressed bit stream.
if (r==-1 || r==0xFF) break;
else m[i]=r;
}
m[i]=0;
*len_out=i-1;
return 0;
}
int notPackedASCII=range_decode_symbol(c,&probPackedASCII,2);
int encodedLength=range_decode_symbol(c,(unsigned int *)h->messagelengths,1024);
for(i=0;i<encodedLength;i++) m[i]='?';
m[i]=0;
if (notPackedASCII==0) {
/* packed ASCII -- copy from input to output */
// printf("decoding packed ASCII\n");
decodePackedASCII(c,m,encodedLength);
*len_out=encodedLength;
return 0;
}
unsigned char nonAlphaValues[1024];
int nonAlphaPositions[1024];
int nonAlphaCount=0;
decodeNonAlpha(c,nonAlphaPositions,nonAlphaValues,&nonAlphaCount,encodedLength);
int alphaCount=encodedLength-nonAlphaCount;
// printf("message contains %d non-alpha characters, %d alpha chars.\n",nonAlphaCount,alphaCount);
unsigned short lowerCaseAlphaChars[1025];
decodeLCAlphaSpace(c,lowerCaseAlphaChars,alphaCount,h,entropyLog);
decodeCaseModel1(c,lowerCaseAlphaChars,alphaCount,h);
mungeCase(lowerCaseAlphaChars,alphaCount);
/* reintegrate alpha and non-alpha characters */
int nonAlphaPointer=0;
int alphaPointer=0;
unsigned short m16[1025];
for(i=0;i<(alphaCount+nonAlphaCount);i++)
{
if (nonAlphaPointer<nonAlphaCount
&&nonAlphaPositions[nonAlphaPointer]==i) {
m16[i]=nonAlphaValues[nonAlphaPointer++];
} else {
m16[i]=lowerCaseAlphaChars[alphaPointer++];
}
}
utf16toutf8(m16,i,m,len_out);
m[*len_out]=0;
// fprintf(stderr,"m='%s', len=%d\n",m,*len_out);
return 0;
}
int stats3_decompress(unsigned char *in,int inlen,unsigned char *out, int *outlen,
stats_handle *h)
{
range_coder *c=range_new_coder(inlen*2);
bcopy(in,c->bit_stream,inlen);
c->bit_stream_length=inlen*8;
c->bits_used=0;
c->low=0;
c->high=0xffffffff;
range_decode_prefetch(c);
if (stats3_decompress_bits(c,out,outlen,h,NULL)) {
range_coder_free(c);
return -1;
}
range_coder_free(c);
return 0;
}
int stats3_compress_radix_append(range_coder *c,unsigned char *m_in,int m_in_len,
stats_handle *h,double *entropyLog)
{
range_encode_equiprobable(c,2,1); // not raw ASCII
range_encode_equiprobable(c,2,0);
range_encode_symbol(c,&probPackedASCII,2,0); // is packed ASCII
range_encode_symbol(c,(unsigned int *)h->messagelengths,1024,m_in_len);
return encodePackedASCII(c,m_in);
}
int stats3_compress_model1_append(range_coder *c,unsigned char *m_in,int m_in_len,
stats_handle *h,double *entropyLog)
{
int len;
unsigned short utf16[1024];
unsigned short alpha[1024]; // message with all non alpha/spaces removed
unsigned short lcalpha[1024]; // message with all alpha chars folded to lower-case
// Convert UTF8 input string to UTF16 for handling
if (utf8toutf16(m_in,m_in_len,utf16,&len)) return -1;
/* Use model instead of just packed ASCII.
We use %10x as the first three bits to indicate compressed message.
This corresponds to a UTF8 continuation byte, which is never allowed at
the start of a string, and so we can use that disallowed state to
indicate whether a message is compressed or not.
*/
range_encode_equiprobable(c,2,1);
range_encode_equiprobable(c,2,0);
range_encode_symbol(c,&probPackedASCII,2,1); // not packed ASCII
// printf("%f bits to encode model\n",c->entropy);
total_model_bits+=c->entropy;
double lastEntropy=c->entropy;
/* Encode length of message */
range_encode_symbol(c,(unsigned int *)h->messagelengths,1024,len);
// printf("%f bits to encode length\n",c->entropy-lastEntropy);
total_length_bits+=c->entropy-lastEntropy;
lastEntropy=c->entropy;
/* encode any non-ASCII characters */
encodeNonAlpha(c,utf16,len);
int alpha_len=0;
stripNonAlpha(utf16,len,alpha,&alpha_len);
// printf("%f bits (%d emitted) to encode non-alpha\n",c->entropy-lastEntropy,c->bits_used);
total_nonalpha_bits+=c->entropy-lastEntropy;
lastEntropy=c->entropy;
/* compress lower-caseified version of message */
stripCase(alpha,alpha_len,lcalpha);
encodeLCAlphaSpace(c,lcalpha,alpha_len,h,entropyLog);
// printf("%f bits (%d emitted) to encode chars\n",c->entropy-lastEntropy,c->bits_used);
total_alpha_bits+=c->entropy-lastEntropy;
lastEntropy=c->entropy;
/* case must be encoded after symbols, so we know how many
letters and where word breaks are.
*/
mungeCase(alpha,alpha_len);
encodeCaseModel1(c,alpha,alpha_len,h);
// printf("%f bits (%d emitted) to encode case\n",c->entropy-lastEntropy,c->bits_used);
total_case_bits+=c->entropy-lastEntropy;
return 0;
}
int stats3_compress_uncompressed_append(range_coder *c,unsigned char *m_in,int m_in_len,
stats_handle *h,double *entropyLog)
{
// As any of these encodes might trigger a rescale,
// we need to match the same order as decoding
// First the two ascii bit flags;
range_encode_equiprobable(c, 2, (m_in[0]&0x80) >> 7);
range_encode_equiprobable(c, 2, (m_in[0]&0x40) >> 6);
// Then the remainder of the first byte
range_encode_equiprobable(c, 64, m_in[0]&0x3f);
// Then encode the rest byte by byte
unsigned i;
for (i = 1; i < m_in_len; i++)
range_encode_equiprobable(c,256,m_in[i]);
// Add $FF byte to terminate
range_encode_equiprobable(c,256,255);
return 0;
}
int stats3_compress_append(range_coder *c,unsigned char *m_in,int m_in_len,
stats_handle *h,double *entropyLog)
{
int b1,b2,b3;
/* Try the three sub-models to see which performs best. */
// Variable depth model
range_coder *t1=range_new_coder(1024);
stats3_compress_model1_append(t1,m_in,m_in_len,h,entropyLog);
range_conclude(t1); b1=t1->bits_used; range_coder_free(t1);
// Packed ascii (only if there are no non-ascii chars)
range_coder *t2=range_new_coder(1024);
if (stats3_compress_radix_append(t2,m_in,m_in_len,h,entropyLog))
b2=999999;
else { range_conclude(t2); b2=t2->bits_used; }
range_coder_free(t2);
// Unpacked (only if the first character <= 127)
b3=(m_in_len+1)*8; // one extra character for null termination
b3=999999;
// Compare the results and encode accordingly
if (b1<b2&&b1<b3)
return stats3_compress_model1_append(c,m_in,m_in_len,h,entropyLog);
else if (b2<b3||(m_in[0]&0x80))
return stats3_compress_radix_append(c,m_in,m_in_len,h,entropyLog);
else
return stats3_compress_uncompressed_append(c,m_in,m_in_len,h,entropyLog);
}
int stats3_compress_bits(range_coder *c,unsigned char *m_in,int m_in_len,
stats_handle *h,double *entropyLog)
{
if (stats3_compress_append(c,m_in,m_in_len,h,entropyLog)) return -1;
range_conclude(c);
// printf("%d bits actually used after concluding.\n",c->bits_used);
total_finalisation_bits+=c->bits_used-c->entropy;
return 0;
}
int stats3_compress(unsigned char *in,int inlen,unsigned char *out, int *outlen,stats_handle *h)
{
range_coder *c=range_new_coder(inlen*2);
if (stats3_compress_bits(c,in,inlen,h,NULL)) {
range_coder_free(c);
return -1;
}
range_conclude(c);
*outlen=c->bits_used>>3;
if (c->bits_used&7) (*outlen)++;
bcopy(c->bit_stream,out,*outlen);
range_coder_free(c);
return 0;
}