Generation scripts and precomputed lookup tables (LUTs) for performing frequency analysis on English text.
Whenever I write frequency analysis code, I am frustrated by the lack of code-friendly tables that can simply be copied and pasted.
Note: The tables below are a derivative work of the corpus, Alice in Wonderland, which is licensed under the Project Gutenberg License. The license terms are rather permissive, but make sure you read them before incorporating these tables into your code. My table generation code is licensed under the MIT license, so feel free to run it on your own corpus.
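The two data structures below represent the same data in different formats: byte_freqs is a 256-entry list indexed by byte value, and char_freqs is a dictionary keyed by character that lists only the characters with a non-zero frequency.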
# Relative frequency of each byte value, indexed by the byte value itself
# (byte_freqs[ord(c)] for a single-byte character c). The list has 256
# entries, eight per row; the trailing comment on each row gives the byte
# range that row covers. Values are given to six decimal places, so 0.000000
# marks a byte with no recorded frequency at that precision.
byte_freqs = [
0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,  # 0x00-0x07
0.000000, 0.000000, 0.022298, 0.000000, 0.000000, 0.022298, 0.000000, 0.000000,  # 0x08-0x0F (0x0A is LF, 0x0D is CR)
0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,  # 0x10-0x17
0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,  # 0x18-0x1F
0.166921, 0.002692, 0.000806, 0.000006, 0.000012, 0.000006, 0.000000, 0.017219,  # 0x20-0x27: space ! " # $ % & '
0.000454, 0.000454, 0.000525, 0.000000, 0.015315, 0.004441, 0.007198, 0.000143,  # 0x28-0x2F: ( ) * + , - . /
0.000131, 0.000406, 0.000072, 0.000078, 0.000060, 0.000078, 0.000042, 0.000036,  # 0x30-0x37: digits 0-7
0.000060, 0.000066, 0.001528, 0.001158, 0.000000, 0.000000, 0.000000, 0.001206,  # 0x38-0x3F: 8 9 : ; < = > ?
0.000012, 0.004309, 0.000746, 0.001116, 0.001444, 0.001868, 0.000800, 0.001152,  # 0x40-0x47: @ A-G
0.001844, 0.004972, 0.000078, 0.000525, 0.000943, 0.001325, 0.001086, 0.001397,  # 0x48-0x4F: H-O
0.001027, 0.000507, 0.001265, 0.001725, 0.003408, 0.000663, 0.000310, 0.001528,  # 0x50-0x57: P-W
0.000036, 0.000848, 0.000006, 0.000024, 0.000000, 0.000024, 0.000000, 0.000024,  # 0x58-0x5F: X Y Z [ backslash ] ^ _
0.000000, 0.054212, 0.009675, 0.016813, 0.031203, 0.090035, 0.013417, 0.016419,  # 0x60-0x67: backtick a-g
0.045247, 0.046572, 0.001325, 0.007174, 0.030159, 0.013399, 0.046978, 0.055173,  # 0x68-0x6F: h-o
0.010719, 0.000806, 0.038198, 0.041666, 0.069420, 0.023080, 0.005437, 0.016091,  # 0x70-0x77: p-w
0.001015, 0.014575, 0.000472, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,  # 0x78-0x7F: x y z { | } ~ DEL
0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,  # 0x80-0x87
0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,  # 0x88-0x8F
0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,  # 0x90-0x97
0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,  # 0x98-0x9F
0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,  # 0xA0-0xA7
0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,  # 0xA8-0xAF
0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,  # 0xB0-0xB7
0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,  # 0xB8-0xBF
0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,  # 0xC0-0xC7
0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,  # 0xC8-0xCF
0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,  # 0xD0-0xD7
0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,  # 0xD8-0xDF
0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,  # 0xE0-0xE7
0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,  # 0xE8-0xEF
0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,  # 0xF0-0xF7
0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000  # 0xF8-0xFF
]
# Character -> relative frequency, given to six decimal places. Only the 85
# characters with a non-zero value are listed; keys appear in ascending
# code-point order and are grouped below by character class.
char_freqs = {
    # Line terminators (both carry the same value)
    '\n': 0.022298, '\r': 0.022298,

    # Space and punctuation, 0x20-0x2F ('&' and '+' have no entry)
    ' ': 0.166921, '!': 0.002692, '"': 0.000806, '#': 0.000006,
    '$': 0.000012, '%': 0.000006, "'": 0.017219, '(': 0.000454,
    ')': 0.000454, '*': 0.000525, ',': 0.015315, '-': 0.004441,
    '.': 0.007198, '/': 0.000143,

    # Digits
    '0': 0.000131, '1': 0.000406, '2': 0.000072, '3': 0.000078,
    '4': 0.000060, '5': 0.000078, '6': 0.000042, '7': 0.000036,
    '8': 0.000060, '9': 0.000066,

    # Punctuation, 0x3A-0x40 ('<', '=' and '>' have no entry)
    ':': 0.001528, ';': 0.001158, '?': 0.001206, '@': 0.000012,

    # Uppercase letters
    'A': 0.004309, 'B': 0.000746, 'C': 0.001116, 'D': 0.001444,
    'E': 0.001868, 'F': 0.000800, 'G': 0.001152, 'H': 0.001844,
    'I': 0.004972, 'J': 0.000078, 'K': 0.000525, 'L': 0.000943,
    'M': 0.001325, 'N': 0.001086, 'O': 0.001397, 'P': 0.001027,
    'Q': 0.000507, 'R': 0.001265, 'S': 0.001725, 'T': 0.003408,
    'U': 0.000663, 'V': 0.000310, 'W': 0.001528, 'X': 0.000036,
    'Y': 0.000848, 'Z': 0.000006,

    # Brackets and underscore
    '[': 0.000024, ']': 0.000024, '_': 0.000024,

    # Lowercase letters
    'a': 0.054212, 'b': 0.009675, 'c': 0.016813, 'd': 0.031203,
    'e': 0.090035, 'f': 0.013417, 'g': 0.016419, 'h': 0.045247,
    'i': 0.046572, 'j': 0.001325, 'k': 0.007174, 'l': 0.030159,
    'm': 0.013399, 'n': 0.046978, 'o': 0.055173, 'p': 0.010719,
    'q': 0.000806, 'r': 0.038198, 's': 0.041666, 't': 0.069420,
    'u': 0.023080, 'v': 0.005437, 'w': 0.016091, 'x': 0.001015,
    'y': 0.014575, 'z': 0.000472,
}