-
Notifications
You must be signed in to change notification settings - Fork 0
/
jcooc.py
170 lines (161 loc) · 7.56 KB
/
jcooc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
#!/usr/bin/env python
# -*- coding: utf-8 -*-
###########################################################################
# #
# (c) Christian Baden #
# The Hebrew University of Jerusalem, Israel, 2016 #
# #
# Please cite as: #
# Baden, Christian & Stalpouskaya, Katsiaryna (2015). Common #
# methodological framework: Content analysis. A mixed-methods #
# strategy for comparatively, diachronically analyzing conflict #
# discourse. INFOCORE Working Paper 2015/10. #
# http://www.infocore.eu/results/ #
# #
# This file is part of JAmCAT, the Jerusalem AmCAT Server: #
# http://jamcat.mscc.huji.ac.il #
# #
# JCOOC is the JAmCAT cooccurrences script, which operates upon the #
# results file generated by the coding script JCODE. #
# It interacts with the JAmCAT server using the AmCAT API. #
# #
# JAmCAT and the JCODE/JAMCODE coding script have been developed by #
# INFOCORE (In)Forming Conflict Prevention, Response and Resolution: #
# The Role of Media in Violent Conflict #
# Funded by the European Union FP7 (Cooperation), Grant Nr. 613308 #
# http://www.infocore.eu/ #
# and #
# RECORD Frame Justification and Resonance in Conflict-Related Discourse #
# Funded by the European Union FP7 (Marie Curie), Grant Nr. 627682 #
# http://www.frame-resonance.eu/ #
# #
# JAmCAT and AmCAT are free software: you can redistribute it and/or #
# modify it under the terms of the GNU Lesser General Public License as #
# published by the Free Software Foundation, either version 3 of the #
# License, or (at your option) any later version. #
# #
# Both are distributed in the hope that it will be useful, but WITHOUT #
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or #
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public #
# License for more details. #
# #
###########################################################################
# JCOOC uses the results file generated by JCODE to determine weighted cooccurrences
# between all recognized concepts, following two complementary logics:
# 1) it uses a windowed, syntax-sensitive algorithm that weights cooccurrences
# within a maximum word distance, counting syntactic boundaries as multiple words
# (JCODE treats commas, colons, semicolons as one; periods, question and exclamation
# marks as three; and paragraph breaks as five words toward the window size).
# Cooccurrence weights are discounted by cubic distance within the window size, and
# added up across all unique instances within a text.
# 2) it constitutes cooccurrences between concepts included in the document title
# and subtitle, and all concepts in the following text, weighted by the square root of
# the frequency of that concept divided by the square root of the number of all
# concepts recognized in that text.
#
# CALL AS:
# jcooc.py <JAmCAT project id> <set id> <window width> <suffix(optional)>
#
# Where:
# <JAmCAT project id> and <set id> specify a document set stored on the JAmCAT server
# (JCOOC operates on the results file, so if the results file has been renamed, the
# command syntax needs to refer to the new names).
# <window width> is the size of the window within which cooccurrences are coded
# <suffix> is an optional suffix that helps distinguish different versions of
# edited results files (e.g., if the results file has been renamed to results_1_17_a.txt,
# the command 'jcooc.py 1 17 <window> _a' identifies the correct file)
#
# OUTPUT:
# cooccurrences_<JAmCAT project id>_<set id><suffix>.txt
# file contents: <document id>,<concept id>,<concept id>,<weight>
# cooccurrences are undirected, all cooccurrences are listed from concepts with
# smaller to concepts with larger id, loops are omitted
import sys
import string
from datetime import date, datetime, timedelta
def group_documents(lines):
    """Yield ``(textid, items)`` for every run of consecutive result lines.

    Each non-blank line is split on commas; the first field is the document
    id and the remaining fields form one item (``[position, concept, ...]``).
    Consecutive lines sharing the same document id are grouped together.

    Unlike a flush-on-id-change loop, the final group is emitted as well,
    so the last document of the results file is not silently dropped.
    Blank lines (e.g. a trailing newline) are skipped.
    """
    textid = None
    items = []
    for line in lines:
        stripped = line.strip()
        if not stripped:
            continue
        row = stripped.split(',')
        if row[0] == textid:
            items.append(row[1:])
        else:
            if items:
                yield textid, items
            textid = row[0]
            items = [row[1:]]
    if items:
        yield textid, items


def _neighbours_within(entries, start, distance):
    """Yield ``(concept, gap)`` for entries after ``start`` inside the window.

    ``entries`` is a list of ``(position, concept)`` pairs.  Scanning stops
    at the first later entry whose position gap is not smaller than
    ``distance``; entries are assumed to be ordered by position.
    """
    origin = entries[start][0]
    for position, concept in entries[start + 1:]:
        gap = position - origin
        if gap >= distance:
            break
        yield concept, gap


def document_cooccurrences(textid, items, distance):
    """Return the weighted cooccurrence rows of a single document.

    Parameters:
        textid   -- document id, copied into every returned row
        items    -- list of ``[position, concept, ...]`` string lists; a
                    position containing 't' is a title position, one
                    containing 's' a subtitle position, anything else a
                    body-text position (the first character is dropped and
                    the rest parsed as an integer)
        distance -- window width (integer)

    Returns a list of ``[textid, concept_a, concept_b, weight]`` rows in
    order of first appearance.  The two concepts of a row are ordered by
    plain string comparison; pairs of identical concepts are omitted.

    Weights are accumulated from two sources:
      1. windowed pairs inside the body text, and inside title/subtitle,
         weighted ``(distance**3 - (gap - 1)**3) / distance**3``;
      2. every title/subtitle concept paired with every distinct body
         concept, weighted ``sqrt(frequency) / sqrt(number of body concepts)``.
    """
    title = []
    body = []
    offset = 0
    for item in items:
        marker, concept = item[0], item[1]
        position = int(marker[1:])
        if 't' in marker:
            offset = position
            title.append((position, concept))
        elif 's' in marker:
            # NOTE(review): the offset grows by 10 for *every* subtitle
            # concept, not once per subtitle; kept as in the original
            # script -- confirm whether this is intended.
            offset += 10
            title.append((position + offset, concept))
        else:
            body.append((position, concept))

    rows = []
    row_index = {}

    def add(first, second, weight):
        # Loops (a concept paired with itself) are omitted.
        if first == second:
            return
        # A tuple key keeps pairs apart that collide when the two ids are
        # merely concatenated (e.g. '1'+'23' and '12'+'3').
        pair = tuple(sorted((first, second)))
        if pair in row_index:
            rows[row_index[pair]][3] += weight
        else:
            row_index[pair] = len(rows)
            rows.append([textid, pair[0], pair[1], weight])

    cube = distance ** 3.0

    def window_weight(gap):
        return (cube - (gap - 1) ** 3.0) / cube

    # 1) windowed cooccurrences within the body text
    for index, (_, concept) in enumerate(body):
        for other, gap in _neighbours_within(body, index, distance):
            add(concept, other, window_weight(gap))

    # Frequency weight of each distinct body concept, in order of first
    # occurrence so that the output order is deterministic.
    counts = {}
    order = []
    for _, concept in body:
        if concept not in counts:
            counts[concept] = 0
            order.append(concept)
        counts[concept] += 1
    total = len(body)
    text_weights = [(concept, counts[concept] ** 0.5 / total ** 0.5)
                    for concept in order]

    # 2) title/subtitle concepts: windowed among themselves, then paired
    #    with every distinct body concept
    for index, (_, concept) in enumerate(title):
        for other, gap in _neighbours_within(title, index, distance):
            add(concept, other, window_weight(gap))
        for other, weight in text_weights:
            add(concept, other, weight)

    return rows


def main(argv=None):
    """Run JCOOC: read ``results_<project>_<set><suffix>.txt`` and write
    ``cooccurrences_<project>_<set><suffix>.txt`` in the working directory.

    ``argv`` defaults to ``sys.argv`` and is expected to hold
    ``[script, project id, set id, window width, optional suffix]``.
    Exits with a usage message when fewer than three arguments are given;
    raises ValueError when the window width is not an integer.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 4:
        sys.exit('usage: jcooc.py <JAmCAT project id> <set id> '
                 '<window width> <suffix(optional)>')
    project = argv[1]
    aset = argv[2]
    distance = int(argv[3])
    suffix = argv[4] if len(argv) > 4 else ''

    print('determining cooccurrences...')
    stem = project + '_' + aset + suffix + '.txt'
    with open('results_' + stem) as source:
        # Binary mode keeps '\n' line endings on every platform.
        with open('cooccurrences_' + stem, 'wb') as target:
            for textid, items in group_documents(source):
                for row in document_cooccurrences(textid, items, distance):
                    line = '%s,%s,%s,%.3f\n' % tuple(row)
                    if not isinstance(line, bytes):
                        # Python 3 text; on Python 2 str already is bytes.
                        line = line.encode('utf-8')
                    target.write(line)
                # one progress dot per processed document
                sys.stdout.write('. ')
                sys.stdout.flush()
    sys.stdout.write('\n')


if __name__ == '__main__':
    main()