Skip to content

Commit

Permalink
latin2ascii.py was moved as a utility
Browse files Browse the repository at this point in the history
git-svn-id: https://pdfminerr.googlecode.com/svn/trunk/pdfminer@215 1aa58f4a-7d42-0410-adbc-911cccaed67c
  • Loading branch information
yusuke.shinyama.dummy committed May 5, 2010
1 parent 7f587ca commit 8e92ddc
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 5 deletions.
5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,12 @@
url='http://www.unixuser.org/~euske/python/pdfminer/index.html',
packages=[
'pdfminer',
'pdfminer.cmap'
'pdfminer.cmap',
],
scripts=[
'tools/pdf2txt.py',
'tools/dumppdf.py'
'tools/dumppdf.py',
'tools/latin2ascii.py',
],
keywords=['pdf parser', 'pdf converter', 'text mining'],
classifiers=[
Expand Down
46 changes: 43 additions & 3 deletions pdfminer/latin2ascii.py → tools/latin2ascii.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,15 +1,20 @@
#!/usr/bin/env python
#
# latin2ascii.py - converts latin1 characters into ascii.
#

import sys

""" Mappings from Latin-1 characters to ASCII.
This is an in-house mapping table for some Latin-1 characters
(acutes, umlauts, etc.) to ASCII strings.
This file is *not* used currently.
"""

LATIN2ASCII = {
#0x00a0: '',
#0x00a7: '',

# iso-8859-1
0x00c0: 'A`',
0x00c1: "A'",
Expand Down Expand Up @@ -87,4 +92,39 @@
0xfb05: 'ft',
0xfb06: 'st',

# Symbols
#0x2013: '',
0x2014: '--',
0x2015: '||',
0x2018: '`',
0x2019: "'",
0x201c: '``',
0x201d: "''",
#0x2022: '',
#0x2212: '',

}

def latin2ascii(s):
    """Return *s* with mapped characters replaced by their ASCII spelling.

    Each character of *s* is looked up by code point in LATIN2ASCII; a
    character with no entry in the table is passed through unchanged.
    """
    pieces = []
    for ch in s:
        # Fall back to the character itself when the table has no mapping.
        pieces.append(LATIN2ASCII.get(ord(ch), ch))
    return ''.join(pieces)


def main(argv):
    """Command-line entry point: write ASCII versions of the given files to stdout.

    Usage: ``argv[0] [-c codec] file ...``

    Every line of every named file is decoded with *codec* (default
    ``'utf-8'``), passed through latin2ascii(), encoded as ASCII and
    written to sys.stdout.

    argv -- full argument vector, program name first (e.g. sys.argv).

    Returns 100 after printing the usage message when the options are
    invalid or no file is given; returns None otherwise.
    """
    import getopt
    import fileinput

    def usage():
        # Single-argument print(...) behaves the same as the print statement.
        print('usage: %s [-c codec] file ...' % argv[0])
        return 100

    try:
        # The trailing colon makes -c take a value.  With a bare 'c' the
        # codec name was parsed as an input file and codec became ''.
        (opts, args) = getopt.getopt(argv[1:], 'c:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    codec = 'utf-8'
    for (k, v) in opts:
        if k == '-c':
            codec = v
    for line in fileinput.input(args):
        # 'ignore' drops bytes that cannot be decoded with the chosen codec.
        line = latin2ascii(unicode(line, codec, 'ignore'))
        # 'replace' substitutes '?' for characters still outside ASCII.
        sys.stdout.write(line.encode('ascii', 'replace'))
    return

# Run as a script: the process exit status is whatever main() returns.
if __name__ == '__main__':
    sys.exit(main(sys.argv))

0 comments on commit 8e92ddc

Please sign in to comment.