utils.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# System libs
import re
import os
import sys
import glob
import shutil
import filecmp
import difflib
import unicodedata

# Third party libs
import termcolor
import bibtexparser
from loguru import logger
from bibtexparser.bwriter import BibTexWriter
from bibtexparser.bparser import BibTexParser

# Local libs
import latex
import nomenclature
import config


# Text type used to coerce field values to strings: ``unicode`` on Python 2,
# ``str`` otherwise. The name ``unicode`` is only evaluated on Python 2.
TEXT_TYPE = unicode if sys.version_info.major == 2 else str


# Lowercase three-letter month abbreviations in calendar order; the position
# of a name (plus one) is its month number.
MONTHS = 'jan feb mar apr may jun jul aug sep oct nov dec'.split()

# Capitalized full English month names in calendar order, one quarter per row;
# the position of a name (plus one) is its month number.
MONTHS_FULL = [
    'January', 'February', 'March',
    'April', 'May', 'June',
    'July', 'August', 'September',
    'October', 'November', 'December',
]


def strip_accents(s):
    """
    Remove combining accent marks from a string.

    Args:
        s (str): text to process.

    Returns:
        The text in NFD-decomposed form with every character of Unicode
        category 'Mn' (nonspacing mark) removed.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)


def has_pdfs(folder):
    """
    Tell whether a folder has pdfs to process.

    Args:
        folder (str): path of the folder to inspect.

    Returns:
        True if the folder contains a `.biblist` file or at least one file
        matching `*.pdf`, False otherwise.
    """
    if os.path.exists(os.path.join(folder, '.biblist')):
        return True
    return bool(glob.glob(os.path.join(folder, "*.pdf")))


def simratio(file1, file2):
    """
    Case-insensitive similarity ratio between two strings.

    Args:
        file1 (str): first string.
        file2 (str): second string.

    Returns:
        The `difflib.SequenceMatcher` ratio (a float in [0, 1]) computed on the
        lowercased inputs.
    """
    matcher = difflib.SequenceMatcher(None, file1.lower(), file2.lower())
    return matcher.ratio()


def get_title(record):
    """
    Build the title of a bibtex record.

    Args:
        record (dict): bibtex entry; must have a 'title' key, and an
            'ENTRYTYPE' key whenever it has a 'booktitle' key.

    Returns:
        The 'title' field, except for entries of type 'book' that carry a
        'booktitle': those yield "<booktitle> - <title>", or just the
        booktitle when the title is empty.
    """
    title = record['title']
    is_book_with_booktitle = 'booktitle' in record and record['ENTRYTYPE'] == 'book'
    if not is_book_with_booktitle:
        return title
    booktitle = record['booktitle']
    return (booktitle + ' - ' + title) if title else booktitle


def most_similar_filename(guess, candidates):
    """
    Find the candidate filename that is the most similar to a guess.

    Args:
        guess (str): filename you want to find a match to.
        candidates (str or iterable): either the path of a folder, whose pdf
            list is then used as candidates, or an iterable of candidate names.

    Returns:
        A tuple (match, score) holding the best candidate and its similarity
        ratio. The first candidate reaching the highest score wins. If no
        candidate scores above zero, ("", 0.0) is returned.
    """
    # A plain string is interpreted as a folder to scan
    if isinstance(candidates, str):
        candidates = get_pdf_list(candidates)

    best = ("", 0.0)
    for name in candidates:
        score = simratio(guess, name)
        # Strict comparison: ties keep the earliest candidate
        if score > best[1]:
            best = (name, score)
    return best


def get_pdf_list(folder):
    """
    List the pdfs to process in a given folder.

    The result contains the basename of every `*.pdf` file of the folder. If
    the folder also holds a `.biblist` file, each of its lines (stripped of
    trailing whitespace) is added as an extra entry.

    Args:
        folder (str): path of the folder to inspect.

    Returns:
        The sorted list of names.
    """
    pdf_names = [os.path.basename(path)
                 for path in glob.glob(os.path.join(folder, "*.pdf"))]
    listing = os.path.join(folder, '.biblist')
    if os.path.exists(listing):
        with open(listing, 'r') as handle:
            pdf_names.extend(line.rstrip() for line in handle)
    pdf_names.sort()
    return pdf_names


def multireplace(string, replacements):
    """
    Given a string and a replacement map, it returns the replaced string.
    Taken from https://gist.github.com/bgusach/a967e0587d6e01e889fd1d776c5f3729

    All replacements are performed in a single pass, so the output of one
    replacement is never matched again by another one.

    Args:
        string (str): string to execute replacements on
        replacements (dict): replacement dictionary {value to find: value to replace}

    Returns:
        The replaced string. The input is returned unchanged when the
        replacement map is empty.
    """

    # An empty map would compile to an empty pattern, which matches at every
    # position of the string: there is simply nothing to replace.
    if not replacements:
        return string

    # Place longer ones first to keep shorter substrings from matching where the
    # longer ones should take place. For instance given the replacements
    # {'ab': 'AB', 'abc': 'ABC'} against the string 'hey abc', it should produce
    # 'hey ABC' and not 'hey ABc'.
    substrs = sorted(replacements, key=len, reverse=True)

    # Create a big OR regex that matches any of the substrings to replace.
    # Keys are escaped so that they are matched literally.
    regexp = re.compile('|'.join(map(re.escape, substrs)))

    # For each match, look up the new string in the replacements. The matched
    # text is the raw key: it must not be escaped again for the lookup.
    return regexp.sub(lambda match: replacements[match.group(0)], string)


def write_with_backup(filename, new_content, use_backup=True):
    """
    Write new_content to a text file, backing up its previous version first.

    When backups are enabled and the file already exists:
      - nothing is done if the file already contains new_content;
      - otherwise the current file is moved to the first free name among
        `<filename>.bak`, `<filename>.bak.1`, `<filename>.bak.2`, ... unless
        one of the existing backups met along the way is identical to it, in
        which case the file is simply overwritten.

    Args:
        filename (str): absolute or relative path the file to backup.
        new_content (str): new content that will be written to filename.
        use_backup (bool): whether to actually backup the file or not.
    """

    # There is something to back up only if the target file already exists
    if use_backup and os.path.exists(filename):
        with open(filename, encoding='utf-8') as current:
            if current.read() == new_content:
                # If file to write is similar to the file to overwrite, do nothing
                return

        # Find new backup filename
        backup = filename + '.bak'
        index = 1
        already_backed_up = False
        while os.path.exists(backup):
            # If a backuped file has similar content, no need to do a new
            # backup. The new content must still be written below.
            if filecmp.cmp(filename, backup, shallow=False):
                already_backed_up = True
                break
            backup = filename + '.bak.' + str(index)
            index += 1

        if not already_backed_up:
            print('Backing up "' + os.path.basename(filename) + '"')
            shutil.move(filename, backup)

    # Write new content
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(new_content)


def sort_entries(db, order_entries_by):
    """
    Since it is hard to sort by decreasing order using biblatex, here we sort
    entries manually.

    Entries are compared field by field, in the order given. The 'year' field
    is compared as an integer in decreasing order (a missing year counts as 0).
    Any other field is compared as a lowercased string in increasing order
    (a missing field counts as the empty string).

    Args:
        db (BibDatabase): the database whose entries to sort.
        order_entries_by (iterable of str): names of the fields to sort by.

    Returns:
        Nothing, but update the entries attribute of the input db.
    """
    def sort_key(entry):
        return tuple(
            -int(entry.get(name, 0)) if name == 'year'
            else TEXT_TYPE(entry.get(name, '')).lower()
            for name in order_entries_by
        )

    db.entries = sorted(db.entries, key=sort_key)


def create_file_dict(db):
    """
    Index the entries of a database by their decoded 'file' field.

    Entries without a 'file' field are ignored. If several entries decode to
    the same file name, the last one wins.

    Args:
        db (BibDatabase): the database to index.

    Returns:
         A dictionary {filename: id} mapping decoded file fields to their index
         in the entry list of the input database.
    """
    index_by_file = {}
    for position, entry in enumerate(db.entries):
        if 'file' in entry:
            index_by_file[decode_filename_field(entry['file'])] = position
    return index_by_file


def guess_manual_files(folder, queried_db, update_queried_db=True):
    """
    Tries to guess which files in a folder correspond to entries placed in the
    `.manual.bib` file. This is useful e.g. to avoid performing online queries
    for files which we know have a manual entry.

    If a `.manual.bib` is present, override corresponding queried entries.
    The way it works is as follows:
      1. Guess the filename of each entry in `.manual.bib`
      2. Find entry in `.queried.bib` with the closest file name in its 'file' field
      3. Override with manual entry

    The function is interactive: whenever a match is not good enough, the user
    is asked on standard input whether to use it.

    Args:
        folder (str): path of the folder holding the pdfs and `.manual.bib`.
        queried_db (BibDatabase): database of queried entries. Its entry list
            is modified in place when update_queried_db is True.
        update_queried_db (bool): whether to replace/append entries of
            queried_db with the manual ones.

    Returns:
        A dictionary {filename: id} built from the 'file' fields of queried_db,
        in which every file matched to a manual entry is mapped to -1.
    """
    files = create_file_dict(queried_db)
    manual_bib_path = os.path.join(folder, '.manual.bib')
    if os.path.exists(manual_bib_path):
        manual_database = read_bib_file(manual_bib_path, homogenize=True)
        for entry in manual_database.entries:
            # Filename generated from the manual entry
            # (presumably the name the pdf of this entry is expected to have)
            guess = nomenclature.gen_filename(entry)
            logger.warning("Try to find a match for manual entry: {}", guess)
            queried_best_score = 0.0
            queried_best_idx = -1
            queried_best_key = None
            # Find the entry from .queried that is the most similar to the manual entry.
            # Keys are visited in sorted order and the comparison is strict, so
            # ties go to the first key. Files already claimed by a previous
            # manual entry are still in `files` with index -1: if one of them
            # is the best key, the index test below fails and the entry is
            # appended instead of replacing a query.
            for key, idx in sorted(files.items()):
                sc = simratio(key, guess)
                if sc > queried_best_score:
                    queried_best_score = sc
                    queried_best_idx = idx
                    queried_best_key = key
            # Find most similar filename in the folder being processed
            match, match_score = most_similar_filename(guess, folder)
            if match_score < 0.9:
                logger.warning("Cannot find a file matching manual entry (simratio: {}).\n- Entry: {}\n- Match: {}", match_score, guess, match)
                # Ask until the user answers 'y' or 'n'
                res = None
                while res not in ['y', 'n']:
                    res = input("Use best match for this entry? [y/n]")
                if res == 'n':
                    # Leave this manual entry unmatched
                    continue
            else:
                logger.info("Found a file matching manual entry: {}", guess)
            # Attach the matched file to the manual entry, and flag the file
            # with the index -1 (the same marker add_skip_files uses)
            entry['file'] = encode_filename_field(match)
            files[match] = -1
            # If best match is good enough, override queried entry with the manual one
            if update_queried_db:
                if queried_best_idx >= 0 and queried_best_score > 0.90:
                    # Confident match: replace silently
                    logger.info("Found a query matching manual entry: {}", guess)
                    queried_db.entries[queried_best_idx] = entry
                elif queried_best_idx >= 0 and queried_best_score > 0.80:
                    # Plausible match: let the user decide between replacing
                    # the queried entry and adding the manual one as new
                    logger.warning("Could not find a query matching manual entry (simratio: {}).\n- Entry: {}\n- Query: {}", queried_best_score, guess, queried_best_key)
                    res = None
                    while res not in ['y', 'n']:
                        res = input("Replace this query with the manual entry? [y/n]")
                    if res == 'y':
                        queried_db.entries[queried_best_idx] = entry
                    else:
                        queried_db.entries.append(entry)
                else:
                    # No usable match: the manual entry is added as a new one
                    logger.debug("Could not find a query matching manual entry: {}", guess)
                    queried_db.entries.append(entry)
    return files


def add_skip_files(folder, files):
    """
    Read the file `.skip.txt` if it exists, and skip the files it contains from online queries.

    Args:
        folder (str): path of the folder that may hold `.skip.txt`.
        files (dict): dictionary {filename: id}, updated in place: each line
            of `.skip.txt` is added as a key mapped to -1.
    """
    skip_path = os.path.join(folder, '.skip.txt')
    if not os.path.isfile(skip_path):
        return
    with open(skip_path, 'r', encoding='utf-8') as handle:
        skipped = handle.read().splitlines()
    files.update(dict.fromkeys(skipped, -1))


def read_bib_file(filename, homogenize=False):
    """
    Read bibtex file.

    A file that does not exist is read as if it were blank.

    Args:
        filename (str): path of the bibtex file.
        homogenize (bool): whether to homogenize the entries upon reading.

    Returns:
        A BibDatabase object.
    """

    # Read input bibtex file; the content is always followed by a space
    if os.path.exists(filename):
        with open(filename, 'r', encoding='utf-8') as handle:
            bibtex_str = handle.read() + " "
    else:
        bibtex_str = "  "

    # Choose parser: the default one, unless entries must be homogenized
    parser = None
    if homogenize:
        parser = BibTexParser(common_strings=True)
        parser.customization = nomenclature.homogenize_latex_encoding

    # Create database from string
    return bibtexparser.loads(bibtex_str, parser=parser)


def write_bib(db, order=False):
    """
    Write bibtex string.

    Note that the input database is modified in the process: month names are
    replaced by two-digit numbers, and depending on the configuration entries
    may be re-encoded and their titles protected. With order=True the entry
    list is also reordered.

    Args:
        db (BibDatabase): database object to dump.
        order (bool): whether to reorder entries upon writing.

    Returns:
        The dumped string.
    """

    # Custom writer; entries are ordered here rather than by the writer
    writer = BibTexWriter()
    writer.indent = '\t'
    writer.order_entries_by = None

    # Replace month by numeric value
    for entry in db.entries:
        if 'month' in entry:
            for x in [MONTHS, MONTHS_FULL]:
                if entry['month'] in x:
                    entry['month'] = '{:02d}'.format(x.index(entry['month']) + 1)

    if order:
        # Manual sort
        order_entries_by = ('year', 'author', 'ID')
        sort_entries(db, order_entries_by)

    if not config.use_utf8_characters:
        db.entries = [nomenclature.encode_ascii_latex(entry) for entry in db.entries]

    if config.protect_uppercase:
        for entry in db.entries:
            # Not every entry has a title: leave those untouched
            if "title" in entry:
                entry["title"] = latex.protect_uppercase(entry["title"])

    # Write bib string
    return writer.write(db)


def encode_filename_field(filename):
    """
    Build the bibtex 'file' field that points to a pdf.

    Args:
        filename (str): absolute or relative name of the file; must end
            with ".pdf".

    Returns:
        The field ":<filename>:PDF", in which every colon of the file name is
        escaped with a backslash.
    """
    assert filename.endswith(".pdf")
    escaped = filename.replace(':', '\\:')
    return ':{}:PDF'.format(escaped)


def decode_filename_field(text):
    """
    Extract the file name out of a bibtex 'file' field.

    Args:
        text (str): content of the field; must end with ":PDF".

    Returns:
        The text found between the first colon of the field and the final
        ":PDF", with escaped colons turned back into plain colons.
    """
    assert text.endswith(":PDF")
    found = re.search(':(.*):PDF', text)
    assert found
    escaped_name = found.group(1)
    return escaped_name.replace('\\:', ':')


def fix_author_field(res_bib, res_json):
    """
    Attempt to fix some defects when the author name is given in an ambiguous
    manner in the bibtex entry. To this end, it uses the matching json entry.
    Only for Crossref entries (needs the json data).

    Bibtex authors are paired with json authors by position. Each "Given
    Family" name confirmed by its json counterpart is rewritten as
    "Family, Given". Bibtex authors without a json counterpart are kept as is.
    Warnings and notices collected along the way are printed at the end.

    Args:
        res_bib (dict): bibtex entry; its 'author' field is updated in place.
        res_json (dict): json entry; its 'author' field is a list of
            dictionaries describing each author.
    """
    # Diagnostics gathered while processing the authors
    messages = []

    def process_pair(author_bib, author_json):
        if ',' in author_bib:
            # Assume entry is correct already
            return author_bib
        if sorted(author_json.keys()) != ['affiliation', 'family', 'given']:
            # Author entry does not have exactly the expected keys
            messages.append(termcolor.colored('W: Too much info in author json entry:', 'yellow'))
            messages.append("JSON: " + str(author_json))
            return author_bib
        if not author_bib.endswith(author_json['family']):
            # Mismatched family name between json and bibtex
            messages.append(termcolor.colored('W: Potential mismatched family name in author entry:', 'yellow'))
            messages.append("BIB : " + author_bib)
            messages.append("JSON: " + str(author_json))
            return author_bib
        # All good, let's remove the ambiguity
        old_name = author_json['given'] + ' ' + author_json['family']
        new_name = author_json['family'] + ", " + author_json['given']
        if old_name != author_bib or len(author_json['family'].split()) > 1:
            s = "I: Author name changed from [" + author_bib + "] to [" + new_name + "]"
            messages.append(termcolor.colored(s, 'yellow'))
        return new_name

    bib_authors = res_bib['author'].split(' and ')
    fixed = [process_pair(a, b) for a, b in zip(bib_authors, res_json['author'])]
    # zip stops at the shortest list: keep the bibtex authors that have no
    # json counterpart instead of dropping them from the entry
    fixed.extend(bib_authors[len(fixed):])
    res_bib['author'] = ' and '.join(fixed)

    if messages:
        print('\n'.join(messages))


def write_remap_script(subst, output_folder):
    """
    Write a bash script to replace old bibtex keys by new ones.

    The script `remap.sh` runs one `sed` substitution over every `*.tex` file
    per renamed key. Nothing is written when no key actually changes. If the
    script already exists, the user is asked before it is overwritten.

    Args:
        subst (dict): dictionary {old key: new key}.
        output_folder (str): folder in which `remap.sh` is written.
    """
    renamed = {old: new for old, new in subst.items() if old != new}
    if not renamed:
        return

    template = """find . -name "*.tex" -exec sed -i 's/{0}/{1}/g' {{}} \\;"""
    outfile = os.path.join(output_folder, "remap.sh")
    if os.path.exists(outfile):
        answer = input("overwrite existing file '{0}' (y/N) ".format(outfile))
        if answer != 'y':
            return

    with open(outfile, 'w') as script:
        script.write("#! /bin/bash\n\n")
        for old, new in renamed.items():
            if new.startswith(old):
                # The old key is a prefix of the new one: first map the new
                # key back to the old one, so that keys already renamed are
                # not extended a second time by the substitution below
                script.write(template.format(new, old) + '\n')
            script.write(template.format(old, new) + '\n')


if __name__ == "__main__":
    def main():
        """Run fix_author_field on a sample author with a two-word family name."""
        bib_entry = {'author': 'Tim Van Hook'}
        json_entry = {
            'author': [
                {'family': 'Van Hook', 'given': 'Tim', 'affiliation': [], 'yay': 1},
            ],
        }
        fix_author_field(bib_entry, json_entry)
    main()