process_pur.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os, zipfile, sys
from BeautifulSoup import BeautifulSoup
from lxml import etree
import time
from StringIO import StringIO
import csv, os.path
import urllib2, json
import  time

# XML namespace URIs; OTYPE_URI is bound to the "t" prefix in every XPath query below.
OTYPE_URI = "http://zakupki.gov.ru/oos/types/1"
EXPORT_URI = "http://zakupki.gov.ru/oos/export/1"

# Output column name -> XPath evaluated on the notification element.
TAGMAP_KEYS = {
    'id': 't:id',
    'placer_fullname': 't:order/t:placer/t:fullName',
    'placer_regnum': 't:order/t:placer/t:regNum',
    'publishDate': 't:publishDate',
}
# Output column name -> XPath evaluated on a lot element.
LOT_KEYS = {
    'maxPrice': 't:customerRequirements/t:customerRequirement/t:maxPrice',
}
# Column order of the rows written by DataProcessor.process_dir().
TAGMAP_SKEYS = [
    'id',
    'placer_regnum',
    'placementType',
    'placer_fullname',
    'publishDate',
    'orderName',
    'product_codes',
    'maxPrice',
]

# Working directories, resolved relative to this script.
BASE_PATH = os.path.dirname(os.path.abspath(__file__))
DATA_PATH = os.path.join(BASE_PATH, "data")
NAMES_PATH = os.path.join(BASE_PATH, "names")

# Code point ranges used to classify characters. Only lowercase letters are
# covered, and the Cyrillic range runs from U+0430 to U+044F, so the letter
# U+0451 (io) is NOT part of it.
LATIN_CHARS = range(ord(u'a'), ord(u'z') + 1)
CYR_CHARS = range(ord(u'а'), ord(u'я') + 1)
NUM_CHARS = range(ord(u'0'), ord(u'9') + 1)

# [class letter, code points] pairs; the first matching entry wins.
TAB_MAP = [
    ['c', CYR_CHARS],
    ['l', LATIN_CHARS],
    ['n', NUM_CHARS],
]

# Built-in list of generic procurement words (one word family per line).
# NOTE: this name is rebound further down to the result of init_keywords(),
# so this literal only serves as the initial value.
# NOTE(review): words are lowercased before they are looked up, so the
# uppercase entry near the end can never match - confirm whether it should
# be lowercase.
DICT_WORDS = [
    u'по', u'и', u'с', u'для', u'или', u'от', u'в', u'на', u'«в»', u'«с»', u'зато',
    u'фку', u'коми', u'фбу', u'фгку',
    u'№3»', u'№№', u'окдп',
    u'поставка', u'поставке', u'поставку', u'поставкой',
    u'выполнение', u'«выполнение', u'выполнению',
    u'оказание', u'«оказание', u'оказанич', u'оказанию',
    u'открытой', u'открытый',
    u'государственного', u'государственной', u'государственных', u'государственный',
    u'муниципального', u'муниципальных', u'муниципальный', u'муниципальном',
    u'заказ', u'заказа',
    u'федеральной', u'федеральному', u'федерации',
    u'района',
    u'проведение', u'проведению',
    u'казенного',
    u'учреждения', u'учреждение', u'учреждению',
    u'административного',
    u'министерства',
    u'бюджетного', u'бюджета',
    u'города', u'городская', u'«городская', u'городе',
    u'саратовской',
    u'договора',
    u'товар', u'товара', u'товаров', u'товары',
    u'запрос',
    u'область', u'областной', u'областном',
    u'башкортостан', u'самарской',
    u'обеспечения', u'обеспечению',
    # u'расходных',
    u'размещение', u'размещению',
    u'российской',
    u'республики', u'республике',
    u'москвы',
    u'рамках',
    u'приобретение', u'приобретению',
    u'предоставление', u'предоставлению',
    u'предпринимательства',
    u'заключить', u'заключения', u'заключение',
    u'услуги',
    u'осуществление', u'осуществлению',
    u'субъектов',
    u'гражданско-правового',
    u'эквивалент',
    u'соответствии',
    u'реализации',
    u'извещение',
    u'свердловской',
    u'белгородской',
    u'согласно',
    u'текущему',
    u'право',
    u'контракт', u'контракта',
    u'нужд', u'нужды',
    u'полугодие',
    u'закупка', u'закупку', u'закупки', u'закупке',
    u'котировка', u'котировке', u'котировку', u'котировкой', u'котировок',
    u'аукцион', u'аукциона', u'аукциону', u'аукционе',
    u'объявление', u'объявления', u'объявлением',
    u'конкурс', u'конкурса', u'конкурсе', u'конкурсом',
    u'ОАЭФ', u'лот', u'электронной', u'форме',
]

def init_keywords():
    """Read keywords.txt from the current working directory.

    Returns a list with one entry per line of the file, in file order: each
    line is stripped of surrounding whitespace and decoded from UTF-8.
    Duplicates and blank lines are kept as they are.
    """
    with open('keywords.txt', 'r') as f:
        return [line.strip().decode('utf8') for line in f]

# Load the misspelled-word list
def init_misspell():
    """Read misspell_words.txt and index its words by length.

    Each line is stripped and decoded from UTF-8. Returns a dict mapping a
    word length to the list of distinct words of that length, in order of
    first appearance.
    """
    by_length = {}
    with open('misspell_words.txt', 'r') as f:
        for line in f:
            word = line.strip().decode('utf8')
            bucket = by_length.setdefault(len(word), [])
            if word not in bucket:
                bucket.append(word)
    return by_length

# Both word lists are loaded at import time from files that are opened by
# relative name, i.e. looked up in the current working directory.
MISSPELL_WORDS = init_misspell()
# NOTE: this rebinds DICT_WORDS, replacing the literal list defined above with
# the contents of keywords.txt.
DICT_WORDS = init_keywords()

# Index the dictionary words by length
def init_dict():
    """Group the entries of DICT_WORDS by their length.

    Returns a dict mapping a word length to the list of distinct words of
    that length, in order of first appearance.
    """
    by_length = {}
    for word in DICT_WORDS:
        bucket = by_length.setdefault(len(word), [])
        if word not in bucket:
            bucket.append(word)
    return by_length

# Length-indexed view of DICT_WORDS ({word length: [words]}), built once at import time.
ALLDICT_WORDS = init_dict()


def is_junk_text(text):
    """Return 10 when the text holds nothing but generic words, otherwise 0.

    The text is lowercased, and '.', ',' and the numero sign are treated as
    separators. A word is generic when its length is a key of ALLDICT_WORDS
    and the word is either listed there, a single character, or all digits.
    The first word that is not generic makes the whole text non-junk.
    Text without any words returns 10, as the JU rule of mark_text() does.

    The previous version returned the two results the wrong way round
    (10 as soon as a non-dictionary word was seen, 0 for all-dictionary
    text), contradicting both its own unknown-length branch and mark_text().
    """
    text = text.replace('.', ' ').replace(',', ' ').replace(u'№', ' ').lower()
    for w in text.split():
        known = ALLDICT_WORDS.get(len(w))
        if known is None:
            # No dictionary word has this length, so w cannot be generic.
            return 0
        if w not in known and len(w) > 1 and not w.isdigit():
            return 0
    return 10

def is_single_word(text):
    """Tell whether the text consists of exactly one whitespace-separated word."""
    words = text.split()
    return len(words) == 1

def is_double_word(text):
    """Tell whether the text consists of exactly two whitespace-separated words."""
    words = text.split()
    return len(words) == 2


# [character-class pattern, weight] pairs consulted by is_latin_word(); a
# word's collapsed pattern must equal an entry exactly to receive its weight.
# Pattern letters: c = Cyrillic, l = Latin, n = digit.
# Disabled Cyrillic/Latin mixes, kept for reference:
#   clclcl=8, clclclc=8, lclclc=8, lclclcl=8, clclc=5, lclcl=5,
#   clc=3, lcl=3, lc=1, cl=1
PAT_MAP = [
    ['cncncnc', 5],
    ['cncnc', 3],
    ['ncnc', 2],
    ['cnc', 1],
    ['ncn', 1],
    ['nc', 0.0],
    ['cn', 0.0],
]

def is_latin_word(word):
    """Classify the characters of a word and weigh its character-class pattern.

    Every character is mapped to a class letter via TAB_MAP ('u' when no
    entry matches) and runs of the same class are collapsed, so the word
    becomes a short pattern such as 'cnc'.

    Returns a (pattern, weight) tuple. The weight comes from PAT_MAP when the
    pattern matches an entry exactly; it is 0 when
      * the word is empty (this used to raise ZeroDivisionError),
      * no two neighbouring characters share a class (pattern as long as the word),
      * the whole word is a single class,
      * fewer than 30% of the characters are Cyrillic, or
      * PAT_MAP has no entry for the pattern.

    Side effect: prints a diagnostic line for every word with a positive
    weight, and for zero-weight words whose pattern contains 'cnc'.
    """
    if not word:
        return "", 0

    pat = ""
    cyrnum = 0
    for ch in word:
        code = ord(ch)
        kind = 'u'
        for key, codes in TAB_MAP:
            if code in codes:
                kind = key
                break
        if kind == 'c':
            cyrnum += 1
        # Collapse consecutive characters of the same class into one letter.
        if not pat or pat[-1] != kind:
            pat += kind

    weight = 0
    cyrshare = cyrnum * 100.0 / len(word)
    if len(pat) != len(word) and len(pat) != 1 and cyrshare >= 30:
        for pattern, value in PAT_MAP:
            if pat == pattern:
                weight = value
                break

    if weight > 0:
        print('match by pat value - word: %s pattern: %s' % (word, pat))
    elif 'cnc' in pat:
        print('match by pat len word: %s pattern: %s' % (word, pat))
    return pat, weight


#        if v in LATIN_CHARS: #lat = True
#        if v in CYR_CHARS: #rus = True
#        if lat and rus: is_latin = True
#        if
#    return is_latin

def is_word_mix(text):
    """Sum the is_latin_word() weights over all words of the text.

    Returns (total weight, list with the pattern of every word in order).
    """
    total = 0
    patterns = []
    for token in text.split():
        pattern, token_weight = is_latin_word(token)
        total += token_weight
        patterns.append(pattern)
    return total, patterns

def is_spaced(text):
    """Count the runs of letters typed with spaces between them.

    A run is a sequence of consecutive one-character tokens that are not
    digits; it is ended by a one-digit token, by a longer token, or by the
    end of the text. Every run of more than three tokens adds 1 to the result.
    """
    run = 0
    hits = 0
    for token in text.split():
        if len(token) == 1 and not token.isdigit():
            run += 1
            continue
        # Any other token closes the current run.
        if run > 3:
            hits += 1
        run = 0
    if run > 3:
        hits += 1
    return hits

def clean(text):
    """Flatten the text onto one line.

    Newlines and carriage returns become a single space, tabs become four
    spaces, and surrounding whitespace is stripped.
    """
    for char, replacement in ((u'\n', u' '), (u'\r', u' '), (u'\t', u'    ')):
        text = text.replace(char, replacement)
    return text.strip()


def is_misspelled(text):
    """Count the words of the text that are listed in MISSPELL_WORDS.

    The text is lowercased, and '.', ',' and the numero sign are treated as
    separators. Every occurrence counts, so a repeated word is counted
    once per occurrence.
    """
    normalized = text.replace('.', ' ').replace(',', ' ').replace(u'№', ' ').lower()
    return sum(1 for word in normalized.split()
               if word in MISSPELL_WORDS.get(len(word), ()))


def mark_text(text):
    """Compact rules processor: score a text against all rules in one pass.

    The text is lowercased, and '.', ',' and the numero sign are treated as
    separators. Each rule code is added to the returned list at most once:

      SP - a run of more than three consecutive one-character, non-digit
           words (+1 per run);
      MS - a word listed in MISSPELL_WORDS (+1 per word);
      LT - a word for which is_latin_word() reports a positive weight
           (+ that weight);
      JU - every word of the text counts as junk (+10). A word is junk when
           it triggered one of the rules above, or when its length is a key
           of ALLDICT_WORDS and the word is listed there, is shorter than
           three characters, or is all digits. Text without words is junk.

    Returns (weight, rules, attrs) where attrs holds:
      'len'    - length of the original text,
      'wl'     - number of words,
      'junkl'  - number of junk words,
      'mwords' - the words that are not junk,
      'junksh' - junk words as a percentage of all words (0 without words).
    """
    attrs = {'len': len(text)}
    rules = []
    weight = 0
    text = text.replace('.', ' ').replace(',', ' ').replace(u'№', ' ').strip().lower()
    words = text.split()
    spaced = 0  # length of the current run of one-character, non-digit words
    attrs['wl'] = len(words)
    attrs['junkl'] = 0
    attrs['mwords'] = []
    for w in words:
        n = len(w)
        curw = 0

        # is spaced
        if n == 1 and not w.isdigit():
            spaced += 1
        else:
            # A one-digit or longer word closes the run. The digit case used
            # to test the word length (always 1) instead of the run length,
            # so runs ended by a digit were never counted.
            if spaced > 3:
                curw += 1
                if 'SP' not in rules:
                    rules.append('SP')
            spaced = 0

        # is misspelled ?
        if w in MISSPELL_WORDS.get(n, ()):
            curw += 1
            if 'MS' not in rules:
                rules.append('MS')

        # is latin word
        pat, latweight = is_latin_word(w)
        if latweight > 0:
            curw += latweight
            if 'LT' not in rules:
                rules.append('LT')

        # is this word junk
        junk = 0
        if curw > 0:
            junk = 1
        elif n in ALLDICT_WORDS:
            if w in ALLDICT_WORDS[n] or n < 3 or w.isdigit():
                junk = 1
        attrs['junkl'] += junk
        if junk == 0:
            attrs['mwords'].append(w)
        weight += curw

    # A run that lasts until the end of the text.
    if spaced > 3:
        if 'SP' not in rules:
            rules.append('SP')
        weight += 1

    isjunk = attrs['wl'] == attrs['junkl']
    attrs['junksh'] = attrs['junkl'] * 100.0 / attrs['wl'] if attrs['wl'] > 0 else 0

    if isjunk:
        weight += 10
        rules.append('JU')
    return weight, rules, attrs


class DataProcessor:
    """Batch jobs over exported purchase notifications and the name files
    derived from them.

    The stages are independent of each other:
      extract_names()  - unpack notification archives under DATA_PATH into
                         one tab-separated file per directory in NAMES_PATH;
      extract_notif()  - score the names in allnames.csv with mark_text() and
                         write the flagged ones to rule_all.csv;
      extract_words() / extract_short() / extract_long()
                       - dump all words / short names / names with very long
                         words from the files in NAMES_PATH.

    Input files are read in binary mode and decoded as UTF-8 (undecodable
    bytes are dropped); rows are written as UTF-8.
    """

    def __init__(self):
        # Every name flagged by process_names_file() is appended to this file.
        self.allrules = open('rule_all.csv', 'w')

    def process_names_file(self, filename, debug_phrase=u'поставка товара', debug_delay=0.5):
        """Score every purchase name of a tab-separated file with mark_text().

        Lines with fewer than 10 columns are skipped. Column 7 holds the name;
        when its weight is positive a row is written to self.allrules with
        the columns: weight, rule codes (comma separated), then input columns
        0, 1, 2, 3, 4, 5, 1, 6, 8, 9 and 7.

        Names containing debug_phrase (case-insensitive) are printed and the
        loop pauses for debug_delay seconds; pass debug_phrase=None to turn
        this off. A progress line "<flagged> <read>" is printed every 3000
        lines.

        Returns the number of lines read.
        """
        checked = 0
        flagged = 0
        with open(filename, 'rb') as f:
            for raw in f:
                checked += 1
                if checked % 3000 == 0:
                    print('%s %s' % (flagged, checked))
                parts = raw.strip().decode('utf8', 'ignore').split('\t')
                if len(parts) < 10:
                    continue
                (rec_id, nottype, custid, custname, orgid,
                 orgname, pdate, text, codes, max_price) = parts[:10]
                fullweight, rulekeys, _ = mark_text(text)
                if fullweight > 0:
                    flagged += 1
                    row = [str(fullweight), ','.join(rulekeys), rec_id, nottype,
                           custid, custname, orgid, orgname, nottype, pdate,
                           codes, max_price, text]
                    s = '\t'.join(row) + '\n'
                    self.allrules.write(s.encode('utf8'))
                if debug_phrase and debug_phrase in text.lower():
                    print('- %s' % (text,))
                    time.sleep(debug_delay)
        return checked

    def extract_notif(self):
        """Score the names in allnames.csv and flush the result file.

        Only allnames.csv (in the current working directory) is processed;
        the statements that used to follow an unconditional return could
        never run and have been removed.
        """
        print(BASE_PATH)
        print(DATA_PATH)
        try:
            self.process_names_file("allnames.csv")
        finally:
            # Push the rows to disk even if processing failed half-way; the
            # file stays open so the method can be called again.
            self.allrules.flush()

    def extract_words_file(self, wordsf, filename):
        """Write every word of column 2 of a names file to wordsf, one per line.

        Lines with fewer than 3 columns are skipped. wordsf must accept
        UTF-8 encoded byte strings. Returns the number of words written.
        """
        count = 0
        with open(filename, 'rb') as f:
            for raw in f:
                parts = raw.strip().decode('utf8', 'ignore').split('\t')
                if len(parts) < 3:
                    continue
                for word in parts[2].split():
                    count += 1
                    wordsf.write((word + u'\n').encode('utf8'))
        return count

    def extract_words(self):
        """Dump the words of all files in NAMES_PATH into words.txt."""
        total = 0
        with open('words.txt', 'w') as wordsf:
            for filename in os.listdir(NAMES_PATH):
                n = self.extract_words_file(wordsf, os.path.join(NAMES_PATH, filename))
                total += n
                print('%s %s %s' % (filename, n, total))

    def extract_names(self):
        """Run process_dir() for every entry of DATA_PATH, printing progress."""
        total = 0
        for regdir in os.listdir(DATA_PATH):
            thepath = os.path.join(DATA_PATH, regdir)
            print('Processing %s' % (regdir,))
            total += self.process_dir(regdir, thepath)
            print('Finished %s %s' % (regdir, total))

    def __field_value(self, tag, xp):
        """Return the first XPath match for xp on tag, or "" without a match."""
        values = tag.xpath(xp, namespaces={"t": OTYPE_URI})
        if len(values) > 0:
            return values[0]
        return ""

    def __get_values(self, tag, keys):
        """Evaluate a {column: xpath} mapping on tag.

        Returns {column: text of the first match with newlines replaced by
        spaces}. The value is "" when nothing matches or when the matched
        element has no text (previously the literal "None" was stored).
        """
        item = {}
        for k, v in keys.items():
            values = tag.xpath(v, namespaces={"t": OTYPE_URI})
            text = values[0].text if len(values) > 0 else None
            if text is None:
                item[k] = ""
            else:
                item[k] = unicode(text).replace('\n', " ")
        return item

    def process_dir(self, aname, thepath):
        """Extract purchase names from the notification archives below thepath.

        Writes NAMES_PATH/<aname>.csv with one tab-separated row (columns in
        TAGMAP_SKEYS order) per orderName element whose parent has at least
        one lot; lot data is taken from the first lot only. Only files whose
        name starts with 'notif' and contains '2012' are read; unreadable
        zip archives are reported and skipped.

        Returns the number of orderName elements seen, or 0 without doing
        anything when the output file already exists.
        """
        total = 0
        csvfile = os.path.join(NAMES_PATH, '%s.csv' % (aname))
        if os.path.exists(csvfile):
            return total
        namespaces = {"t": OTYPE_URI}
        with open(csvfile, 'w') as fullf:
            for dirname, dirnames, filenames in os.walk(thepath):
                for filename in filenames:
                    if filename.find('2012') == -1 or filename[0:5] != 'notif':
                        continue
                    pname = os.path.join(dirname, filename)
                    try:
                        archive = zipfile.ZipFile(pname)
                    except zipfile.BadZipfile:
                        print('Bad zip file %s' % (filename,))
                        continue
                    try:
                        for zname in archive.namelist():
                            doc = etree.parse(StringIO(archive.read(zname)))
                            names = doc.xpath('//t:orderName', namespaces=namespaces)
                            for name in names:
                                pur = name.getparent()
                                lots = pur.xpath('t:lots/t:lot', namespaces=namespaces)
                                if len(lots) == 0:
                                    continue
                                item = self.__get_values(pur, TAGMAP_KEYS)
                                # An empty element has text None; store it as "".
                                order_name = name.text or u''
                                item['orderName'] = order_name.replace('\n', " ").replace('\t', ' ')
                                # Tag without its "{namespace}" prefix.
                                item['placementType'] = pur.tag.rsplit('}', 1)[-1]
                                lot = lots[0]
                                item.update(self.__get_values(lot, LOT_KEYS))
                                products = lot.xpath('t:products/t:product/t:code',
                                                     namespaces=namespaces)
                                item['product_codes'] = ','.join(
                                    [p.text for p in products if p.text is not None])
                                s = '\t'.join([item[k] for k in TAGMAP_SKEYS]) + '\n'
                                fullf.write(s.encode('utf8'))
                            total += len(names)
                            print('- %s %s' % (zname, total))
                    finally:
                        archive.close()
        return total

    def extract_short_file(self, wordsf, filename):
        """Copy the lines whose column 2 is shorter than 5 characters to wordsf.

        The length is appended to each copied line as an extra column. Lines
        with fewer than 3 columns are skipped and not counted (they used to
        raise IndexError). wordsf must accept UTF-8 encoded byte strings.

        Returns (number of lines copied, number of lines examined).
        """
        short = 0
        seen = 0
        with open(filename, 'rb') as f:
            for raw in f:
                line = raw.strip().decode('utf8', 'ignore')
                parts = line.split('\t')
                if len(parts) < 3:
                    continue
                name = parts[2]
                if len(name) < 5:
                    short += 1
                    out = line + u'\t%d' % (len(name)) + u'\n'
                    wordsf.write(out.encode('utf8'))
                seen += 1
        return short, seen

    def extract_short(self):
        """Collect the short names of all files in NAMES_PATH into short.txt."""
        total = 0
        with open('short.txt', 'w') as wordsf:
            for filename in os.listdir(NAMES_PATH):
                n, seen = self.extract_short_file(wordsf, os.path.join(NAMES_PATH, filename))
                share = n * 100.0 / seen if seen > 0 else 0
                total += n
                print('%s %s %s %s' % (filename, n, total, share))

    def extract_long_file(self, wordsf, filename):
        """Copy the lines whose column 2 contains a suspiciously long word.

        A line qualifies when column 2 has fewer than 4 words and one of
        them is longer than 20 characters and contains no hyphen; the length
        of the first such word is appended as an extra column. Lines with
        fewer than 3 columns are skipped and not counted (they used to raise
        IndexError). wordsf must accept UTF-8 encoded byte strings.

        Returns (number of lines copied, number of lines examined).
        """
        hits = 0
        seen = 0
        with open(filename, 'rb') as f:
            for raw in f:
                line = raw.strip().decode('utf8', 'ignore')
                parts = line.split('\t')
                if len(parts) < 3:
                    continue
                words = parts[2].split()
                if len(words) < 4:
                    for w in words:
                        if len(w) > 20 and w.find('-') == -1:
                            hits += 1
                            out = line + u'\t%d' % (len(w)) + u'\n'
                            wordsf.write(out.encode('utf8'))
                            break
                seen += 1
        return hits, seen

    def extract_long(self):
        """Collect the names with long words of all files in NAMES_PATH into long.txt."""
        total = 0
        with open('long.txt', 'w') as wordsf:
            for filename in os.listdir(NAMES_PATH):
                n, seen = self.extract_long_file(wordsf, os.path.join(NAMES_PATH, filename))
                share = n * 100.0 / seen if seen > 0 else 0
                total += n
                print('%s %s %s %s' % (filename, n, total, share))


if __name__ == "__main__":
    # Only the rule-marking stage runs by default. The other stages of
    # DataProcessor (extract_names, extract_words, extract_short,
    # extract_long) are run by calling them here instead.
    DataProcessor().extract_notif()