Skip to content

Commit

Permalink
Merge pull request #46 from ausgerechnet/v0.10.2
Browse files Browse the repository at this point in the history
V0.10.2
  • Loading branch information
ausgerechnet authored Mar 6, 2022
2 parents 626caeb + a8989fc commit f6296ce
Show file tree
Hide file tree
Showing 23 changed files with 756 additions and 447 deletions.
2 changes: 1 addition & 1 deletion Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ sphinx = "*"
enthought-sphinx-theme = "*"

[packages]
association-measures = ">=0.1.6"
association-measures = ">=0.2.0"
pandas = ">=1.2.0"
numexpr = ">=2.7.1"
Bottleneck = ">=1.3.2"
Expand Down
91 changes: 40 additions & 51 deletions Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 17 additions & 1 deletion ccc/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,27 @@ def __init__(self, path=None):
directory = os.path.dirname(path)
os.makedirs(directory, exist_ok=True)

def delete(self, identifier):
    """Remove one object from the on-disk cache.

    :param identifier: either a ready-made string key, or any object
        that generate_idx can hash into a key
    """

    # a cache without a backing path is a no-op cache
    if self.path is None:
        logger.info('cache: no path')
        return

    # derive the shelve key: strings are used verbatim,
    # everything else goes through generate_idx
    key = identifier if isinstance(identifier, str) else generate_idx(identifier)

    # delete only if present; missing keys are silently ignored
    with shelve.open(self.path) as db:
        if key in db:
            logger.info('cache: deleting object "%s"' % key)
            del db[key]

def get(self, identifier):

if self.path is None:
logger.info('cache: no path')
return None
return

if isinstance(identifier, str):
key = identifier
Expand Down
25 changes: 13 additions & 12 deletions ccc/collocates.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from pandas import DataFrame

# part of module
from .counts import score_counts_signature
from .counts import score_counts
from .utils import node2cotext

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -72,15 +72,12 @@ def count(self, window):
logger.info('slicing window %d' % window)
relevant = self.df_cooc.loc[abs(self.df_cooc['offset']) <= window]

# number of possible occurrence positions within window
f1 = len(relevant)

# frequency counts
f = self.corpus.counts.cpos(relevant['cpos'], self.p_query)

return f, f1
return f

def show(self, window=5, order='O11', cut_off=100, ams=None,
def show(self, window=5, order='log_likelihood', cut_off=100, ams=None,
min_freq=2, frequencies=True, flags=None,
marginals='corpus'):

Expand All @@ -89,8 +86,10 @@ def show(self, window=5, order='O11', cut_off=100, ams=None,
logger.error("nothing to show")
return DataFrame()

# get subcorpus frequencies
f, f1 = self.count(window)
# get window counts and apply min freq
f = self.count(window).rename(columns={'freq': 'f'})
f1 = f['f'].sum()
f = f.loc[f['f'] >= min_freq]

# get reference frequencies
if isinstance(marginals, str):
Expand All @@ -112,11 +111,13 @@ def show(self, window=5, order='O11', cut_off=100, ams=None,
f2 = f2.fillna(0, downcast='infer')
f2['f2'] = f2['marginal'] - f2['in_nodes']

# create dataframe
df = f2.join(f)
df['f1'] = f1
df['N'] = N

# score
collocates = score_counts_signature(
f[['freq']], f1, f2[['f2']], N,
min_freq, order, cut_off, flags, ams, frequencies
)
collocates = score_counts(df, order, cut_off, flags, ams)

if frequencies:
# throw away anti-collocates by default
Expand Down
86 changes: 8 additions & 78 deletions ccc/counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,105 +480,35 @@ def mwus(self, cqp, queries, p_atts=None, fill_missing=True, strategy=1):
return df


def score_counts(df1, df2, R1=None, R2=None, reference='right',
min_freq=2, order='log_likelihood', cut_off=1000,
flags=None, ams=None, freq=True, digits=6):
"""calculate association measures for two frequency lists df1, df2
with respective sizes R1, R2.
:param DataFrame df1: counts per item in corpus 1
:param DataFrame df2: counts per item in corpus 2
:param int R1: number of items in df1
:param int R2: number of items in df2
:param str reference: which dataframe is the reference?
:param int min_freq: minimum number of occurrences in df1
def score_counts(df, order='log_likelihood', cut_off=1000,
flags=None, ams=None, digits=6):
"""score counts in DataFrame.
:param DataFrame df: DataFrame with reasonably-named columns, index 'item'
:param str order: association measure for sorting (in descending order)
:param int cut_off: number of items to retrieve
:param str flags: '%c' / '%d' / '%cd' (cwb-ccc algorithm)
:param list ams: association measures to calculate (None=all)
:param bool freq: include absolute and relative frequencies?
:param int digits: round dataframe
:return: scored counts
:rtype: ScoreFrame
"""

# which one should be treated as reference?
if reference == 'left':
return score_counts(df2, df1, R2, R1, reference='left',
order=order, cut_off=cut_off,
flags=flags, ams=ams, freq=freq,
digits=digits)

logger.info('creating table of association measures')

# preprocess
df1.columns = ['O11']
df2.columns = ['O21']

# get corpus sizes if necessary
R1 = df1['O11'].sum() if R1 is None else R1
R2 = df2['O21'].sum() if R2 is None else R2

# join dataframes respecting min_freq
if min_freq == 0:
df = df1.join(df2, how='outer')
else:
df1 = df1.loc[df1['O11'] >= min_freq]
df = df1.join(df2, how='left')
df = df.fillna(0, downcast='infer')

# post-processing: fold items
df = fold_df(df, flags)

# calculate association
df["O12"] = R1 - df["O11"]
df["O22"] = R2 - df["O21"]
df = measures.calculate_measures(df, freq=freq)

if freq:
# add instances per million
df['ipm'] = df['O11'] / R1 * 1000000
df['ipm_expected'] = df['E11'] / R1 * 1000000
df['ipm_reference'] = df['O21'] / R2 * 1000000
df['ipm_reference_expected'] = df['E21'] / R2 * 1000000
df = measures.score(df, measures=ams, freq=True, digits=digits)

# sort
df = df.sort_values(by=[order, 'O11', 'O12'], ascending=False)
df = df.sort_values(by=[order, 'item'], ascending=False)

# apply cut-off
df = df.head(cut_off) if cut_off is not None else df

# round
df = round(df, digits) if digits is not None else df

return df


def score_counts_signature(f, f1, f2, N, min_freq=2,
order='log_likelihood', cut_off=1000,
flags=None, ams=None, freq=True, digits=6):
"""wrapper of score_counts for input in frequency signature notation.
:param DataFrame f: co-occurrence freq. of token and node
:param int f1: number of tokens in W(node)
:param DataFrame f2: marginal freq. of tokens
:param int N: size of corpus
:return: scored counts
:rtype: ScoreFrame
"""

f.columns = ['O11']
f2.columns = ['C1']
df = f.join(f2, how='outer').fillna(0, downcast='infer')
df['O21'] = df['C1'] - df['O11']

return score_counts(
f, df[['O21']], f1, N-f1, reference='right', min_freq=min_freq,
order=order, cut_off=cut_off, flags=flags, ams=ams,
freq=freq, digits=digits
)
Loading

0 comments on commit f6296ce

Please sign in to comment.