# jcooc.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

###########################################################################
#                                                                         #
#                           (c) Christian Baden                           #
#             The Hebrew University of Jerusalem, Israel, 2016            #
#                                                                         #
# Please cite as:                                                         #
#        Baden, Christian & Stalpouskaya, Katsiaryna (2015). Common       #
#        methodological framework: Content analysis. A mixed-methods      #
#        strategy for comparatively, diachronically analyzing conflict    #
#        discourse. INFOCORE Working Paper 2015/10.                       #
#        http://www.infocore.eu/results/                                  #
#                                                                         #
# This file is part of JAmCAT, the Jerusalem AmCAT Server:                #
#        http://jamcat.mscc.huji.ac.il                                    #
#                                                                         #
# JCOOC is the JAmCAT cooccurrences script, which operates upon the       #
# results file generated by the coding script JCODE.                      #
# It interacts with the JAmCAT server using the AmCAT API.                #
#                                                                         #
# JAmCAT and the JCODE/JAMCODE coding script have been developed by       #
# INFOCORE (In)Forming Conflict Prevention, Response and Resolution:      #
#        The Role of Media in Violent Conflict                            #
#        Funded by the European Union FP7 (Cooperation), Grant Nr. 613308 #
#        http://www.infocore.eu/                                          #
# and                                                                     #
# RECORD Frame Justification and Resonance in Conflict-Related Discourse  #
#        Funded by the European Union FP7 (Marie Curie), Grant Nr. 627682 #
#        http://www.frame-resonance.eu/                                   #
#                                                                         #
# JAmCAT and AmCAT are free software: you can redistribute it and/or      #
# modify it under the terms of the GNU Lesser General Public License as   #
# published by the Free Software Foundation, either version 3 of the      #
# License, or (at your option) any later version.                         #
#                                                                         #
# Both are distributed in the hope that it will be useful, but WITHOUT    #
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or   #
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public     #
# License for more details.                                               #
#                                                                         #
###########################################################################

# JCOOC uses the results file generated by JCODE to determine weighted cooccurrences
# between all recognized concepts, following two complementary logics:
# 1) it uses a windowed, syntax-sensitive algorithm that weights cooccurrences
# within a maximum word distance, counting syntactic boundaries as multiple words
# (JCODE treats commas, colons, semicolons as one; periods, question and exclamation
# marks as three; and paragraph breaks as five words toward the window size).
# Cooccurrence weights are discounted by cubic distance within the window size, and
# added up across all unique instances within a text.
# 2) it constitutes cooccurrences between concepts included in the document title
# and subtitle, and all concepts in the following text, weighted by the square root of
# the frequency of that concept divided by the square root of the number of all
# concepts recognized in that text.
# 
# CALL AS:
# jcooc.py <JAmCAT project id> <set id> <window width> <suffix(optional)>
# 
# Where:
# <JAmCAT project id> and <set id> specify a document set stored on the JAmCAT server
# (JCOOC operates on the results file, so if the results file has been renamed, the
# command syntax needs to refer to the new names).
# <window width> is the size of the window within which cooccurrences are coded
# <suffix> is an optional suffix that helps distinguish different versions of
# edited results files (e.g., if the results file has been renamed to results_1_17_a.txt,
# the command 'jcooc.py 1 17 <window> _a' identifies the correct file)
#
# OUTPUT:
# cooccurrences_<JAmCAT project id>_<set id><suffix>.txt
# file contents: <document id>,<concept id>,<concept id>,<weight>
# cooccurrences are undirected, all cooccurrences are listed from concepts with
# smaller to concepts with larger id (ids are compared as strings), loops are omitted

import sys
import string
from datetime import date, datetime, timedelta

def _classify_items(items):
    """Split one document's coded items into heading and body concepts.

    Each item is a list whose first element is a position marker (one
    prefix character followed by an integer) and whose second element is
    a concept id.  Markers containing 't' or 's' (presumably title and
    subtitle) go to the heading list; everything else goes to the body.

    Returns a pair ``(heading, body)`` of ``[position, concept]`` lists,
    each in input order.
    """
    heading = []
    body = []
    offset = 0
    for item in items:
        marker = item[0]
        concept = item[1]
        position = int(marker[1:])
        if 't' in marker:
            # Remember the latest 't' position so 's' items are placed after it.
            offset = position
            heading.append([position, concept])
        elif 's' in marker:
            # NOTE(review): the offset grows by 10 for EVERY 's' item, not once
            # per document, so later 's' concepts drift further apart.  Kept
            # as in the original script - confirm whether this is intended.
            offset = offset + 10
            heading.append([position + offset, concept])
        else:
            body.append([position, concept])
    return heading, body


def _window_weights(located, anchor, distance):
    """Yield ``(concept, concept, weight)`` for pairs starting at one anchor.

    ``located`` is a list of ``[position, concept]`` entries.  The entries
    following ``located[anchor]`` are scanned until the first one whose
    position lies ``distance`` or more beyond the anchor; scanning stops
    there, so entries are assumed to be ordered by position.  Pairs of
    identical concepts (loops) are skipped.
    """
    cube = distance ** 3.0
    start, first = located[anchor]
    for position, second in located[anchor + 1:]:
        gap = position - start
        if gap >= distance:
            break
        if first != second:
            # Cubic discount: adjacent concepts (gap 1) receive full weight.
            yield first, second, (cube - (gap - 1) ** 3.0) / cube


def document_cooccurrences(textid, items, distance):
    """Compute the weighted cooccurrences of a single document.

    Args:
        textid: document id, copied into every output row.
        items: list of ``[marker, concept id, ...]`` lists for the document.
        distance: window width for the windowed cooccurrences.

    Returns:
        A list of ``[textid, concept, concept, weight]`` rows, one per
        unordered concept pair, in order of first appearance.  Within a
        row the two concept ids are ordered by string comparison.
    """
    heading, body = _classify_items(items)
    rows = []
    slot = {}

    def credit(first, second, weight):
        # Key on the pair itself: concatenating the ids (as the script used
        # to do) confuses e.g. ('1', '23') with ('12', '3').
        pair = tuple(sorted((first, second)))
        if pair in slot:
            rows[slot[pair]][3] = rows[slot[pair]][3] + weight
        else:
            slot[pair] = len(rows)
            rows.append([textid, pair[0], pair[1], weight])

    # 1) windowed cooccurrences within the body text
    for anchor in range(len(body)):
        for first, second, weight in _window_weights(body, anchor, distance):
            credit(first, second, weight)

    # Frequencies of the body concepts, counted once per document.
    body_ids = [entry[1] for entry in body]
    frequency = {}
    for concept in body_ids:
        frequency[concept] = frequency.get(concept, 0) + 1
    distinct = set(body_ids)
    total_root = len(body_ids) ** 0.5

    for anchor in range(len(heading)):
        # windowed cooccurrences among the heading concepts themselves
        for first, second, weight in _window_weights(heading, anchor, distance):
            credit(first, second, weight)
        # 2) heading concept with every distinct body concept
        lead = heading[anchor][1]
        for concept in distinct:
            if concept != lead:
                credit(lead, concept, frequency[concept] ** 0.5 / total_root)
    return rows


def iter_document_rows(lines, distance):
    """Group result lines by document and yield each document's rows.

    ``lines`` is an iterable of ``<document id>,<marker>,<concept id>[,...]``
    strings.  Consecutive lines sharing the same first field form one
    document.  Blank lines are ignored.  The final document is yielded
    after the input is exhausted (the original script dropped it).
    """
    current = None
    items = []
    for line in lines:
        stripped = line.strip()
        if not stripped:
            continue
        fields = stripped.split(',')
        if fields[0] != current:
            if current is not None:
                yield document_cooccurrences(current, items, distance)
            current = fields[0]
            items = []
        items.append(fields[1:])
    if current is not None:
        yield document_cooccurrences(current, items, distance)


def main(argv=None):
    """Command-line entry point.

    Reads ``results_<project>_<set><suffix>.txt`` from the working
    directory and writes ``cooccurrences_<project>_<set><suffix>.txt``
    with one ``<document id>,<concept id>,<concept id>,<weight>`` line per
    cooccurrence.  A progress dot is printed for every document.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 4:
        sys.exit('usage: jcooc.py <JAmCAT project id> <set id> '
                 '<window width> <suffix(optional)>')
    project = argv[1]
    aset = argv[2]
    distance = int(argv[3])
    suffix = argv[4] if len(argv) > 4 else ''
    stem = project + '_' + aset + suffix + '.txt'

    print('determining cooccurrences...')

    with open('results_' + stem) as source:
        with open('cooccurrences_' + stem, 'wb') as target:
            for rows in iter_document_rows(source, distance):
                for textid, first, second, weight in rows:
                    line = '%s,%s,%s,%.3f\n' % (textid, first, second, weight)
                    if not isinstance(line, bytes):
                        # Python 3: the binary stream needs encoded text.
                        line = line.encode('utf-8')
                    target.write(line)
                sys.stdout.write('. ')
    sys.stdout.write('\n')


if __name__ == '__main__':
    main()