Skip to content

Commit

Permalink
Merge pull request #22 from etalab/type_detection_by_header
Browse files Browse the repository at this point in the history
Type detection by header
  • Loading branch information
geoffreyaldebert authored Feb 9, 2022
2 parents abeca72 + 4759595 commit d0c2060
Show file tree
Hide file tree
Showing 55 changed files with 737 additions and 17 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,6 @@ build
csv_detective.egg-info
dist
venv
.DS_Store

analyse_csv.py
43 changes: 42 additions & 1 deletion csv_detective/all_packages.txt
Original file line number Diff line number Diff line change
Expand Up @@ -48,5 +48,46 @@ csv_detective.detect_fields.FR.geo.insee_canton
csv_detective.detect_fields.FR.other.date_fr
csv_detective.detect_fields.FR.other.code_waldec
csv_detective.detect_fields.FR.other.code_rna
csv_detective.detect_labels.FR.geo.adresse
csv_detective.detect_labels.FR.geo.code_commune_insee
csv_detective.detect_labels.FR.geo.code_departement
csv_detective.detect_labels.FR.geo.code_fantoir
csv_detective.detect_labels.FR.geo.code_postal
csv_detective.detect_labels.FR.geo.code_region
csv_detective.detect_labels.FR.geo.commune
csv_detective.detect_labels.FR.geo.departement
csv_detective.detect_labels.FR.geo.insee_canton
csv_detective.detect_labels.FR.geo.latitude_l93
csv_detective.detect_labels.FR.geo.latitude_wgs_fr_metropole
csv_detective.detect_labels.FR.geo.longitude_l93
csv_detective.detect_labels.FR.geo.longitude_wgs_fr_metropole
csv_detective.detect_labels.FR.geo.pays
csv_detective.detect_labels.FR.geo.region
csv_detective.detect_labels.FR.other.code_csp_insee
csv_detective.detect_labels.FR.other.code_rna
csv_detective.detect_labels.FR.other.code_waldec
csv_detective.detect_labels.FR.other.csp_insee
csv_detective.detect_labels.FR.other.date_fr
csv_detective.detect_labels.FR.other.insee_ape700
csv_detective.detect_labels.FR.other.sexe
csv_detective.detect_labels.FR.other.siren
csv_detective.detect_labels.FR.other.siret
csv_detective.detect_labels.FR.other.tel_fr
csv_detective.detect_labels.FR.other.uai
csv_detective.detect_labels.FR.temp.jour_de_la_semaine
csv_detective.detect_labels.FR.temp.mois_de_annee
csv_detective.detect_labels.geo.iso_country_code
csv_detective.detect_labels.geo.json_geojson
csv_detective.detect_labels.geo.latitude_wgs
csv_detective.detect_labels.geo.latlon_wgs
csv_detective.detect_labels.geo.longitude_wgs
csv_detective.detect_labels.other.booleen
csv_detective.detect_labels.other.email
csv_detective.detect_labels.other.floats
csv_detective.detect_labels.other.money

csv_detective.detect_labels.other.twitter
csv_detective.detect_labels.other.url
csv_detective.detect_labels.temp.date
csv_detective.detect_labels.temp.datetime_iso
csv_detective.detect_labels.temp.datetime_rfc822
csv_detective.detect_labels.temp.year
Empty file.
Empty file.
15 changes: 15 additions & 0 deletions csv_detective/detect_labels/FR/geo/adresse/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from csv_detective.utils import full_word_strictly_inside_string
from csv_detective.process_text import _process_text

PROPORTION = 1

def _is(header):
    '''Score how strongly a column header looks like an address label.

    The header is normalised with ``_process_text`` and compared with a
    fixed list of known labels.

    Returns:
        1.0 if the processed header equals one of the labels,
        0.5 if ``full_word_strictly_inside_string(label, processed_header)``
        is truthy for at least one label,
        0.0 otherwise.
    '''
    words_combinations_list = [
        'adresse',
        'adresse postale',
        'adresse geographique',
        'adr',
        'adresse complete',
        'adresse station',
    ]
    processed_header = _process_text(header)

    # An exact match is the strongest signal; no need to scan for partial hits.
    if processed_header in words_combinations_list:
        return 1.0

    # A label embedded in a longer header only counts for half.
    if any(
        full_word_strictly_inside_string(words_combination, processed_header)
        for words_combination in words_combinations_list
    ):
        return 0.5

    return 0.0
15 changes: 15 additions & 0 deletions csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from csv_detective.utils import full_word_strictly_inside_string
from csv_detective.process_text import _process_text

PROPORTION = 1

def _is(header):
    '''Score how strongly a column header looks like an INSEE commune code label.

    The header is normalised with ``_process_text`` and compared with a
    fixed list of known labels.

    Returns:
        1.0 if the processed header equals one of the labels,
        0.5 if ``full_word_strictly_inside_string(label, processed_header)``
        is truthy for at least one label,
        0.0 otherwise.
    '''
    words_combinations_list = ['code commune insee', 'code insee', 'codes insee']
    processed_header = _process_text(header)

    # An exact match is the strongest signal; no need to scan for partial hits.
    if processed_header in words_combinations_list:
        return 1.0

    # A label embedded in a longer header only counts for half.
    if any(
        full_word_strictly_inside_string(words_combination, processed_header)
        for words_combination in words_combinations_list
    ):
        return 0.5

    return 0.0
15 changes: 15 additions & 0 deletions csv_detective/detect_labels/FR/geo/code_departement/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from csv_detective.utils import full_word_strictly_inside_string
from csv_detective.process_text import _process_text

PROPORTION = 1

def _is(header):
    '''Score how strongly a column header looks like a departement code label.

    The header is normalised with ``_process_text`` and compared with a
    fixed list of known labels.

    Returns:
        1.0 if the processed header equals one of the labels,
        0.5 if ``full_word_strictly_inside_string(label, processed_header)``
        is truthy for at least one label,
        0.0 otherwise.
    '''
    # 'dep': possible confusion with the departement name?
    words_combinations_list = ['code departement', 'dep']
    processed_header = _process_text(header)

    # An exact match is the strongest signal; no need to scan for partial hits.
    if processed_header in words_combinations_list:
        return 1.0

    # A label embedded in a longer header only counts for half.
    if any(
        full_word_strictly_inside_string(words_combination, processed_header)
        for words_combination in words_combinations_list
    ):
        return 0.5

    return 0.0
15 changes: 15 additions & 0 deletions csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from csv_detective.utils import full_word_strictly_inside_string
from csv_detective.process_text import _process_text

PROPORTION = 1

def _is(header):
    '''Score how strongly a column header looks like a FANTOIR code label.

    The header is normalised with ``_process_text`` and compared with a
    fixed list of known labels.

    Returns:
        1.0 if the processed header equals one of the labels,
        0.5 if ``full_word_strictly_inside_string(label, processed_header)``
        is truthy for at least one label,
        0.0 otherwise.
    '''
    words_combinations_list = ['cadastre1', 'code fantoir', 'fantoir']
    processed_header = _process_text(header)

    # An exact match is the strongest signal; no need to scan for partial hits.
    if processed_header in words_combinations_list:
        return 1.0

    # A label embedded in a longer header only counts for half.
    if any(
        full_word_strictly_inside_string(words_combination, processed_header)
        for words_combination in words_combinations_list
    ):
        return 0.5

    return 0.0
15 changes: 15 additions & 0 deletions csv_detective/detect_labels/FR/geo/code_postal/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from csv_detective.utils import full_word_strictly_inside_string
from csv_detective.process_text import _process_text

PROPORTION = 1

def _is(header):
    '''Score how strongly a column header looks like a postal code label.

    The header is normalised with ``_process_text`` and compared with a
    fixed list of known labels.

    Returns:
        1.0 if the processed header equals one of the labels,
        0.5 if ``full_word_strictly_inside_string(label, processed_header)``
        is truthy for at least one label,
        0.0 otherwise.
    '''
    words_combinations_list = [
        'code postal',
        'postal code',
        'postcode',
        'post code',
        'cp',
        'codes postaux',
        'location postcode',
    ]
    processed_header = _process_text(header)

    # An exact match is the strongest signal; no need to scan for partial hits.
    if processed_header in words_combinations_list:
        return 1.0

    # A label embedded in a longer header only counts for half.
    if any(
        full_word_strictly_inside_string(words_combination, processed_header)
        for words_combination in words_combinations_list
    ):
        return 0.5

    return 0.0
15 changes: 15 additions & 0 deletions csv_detective/detect_labels/FR/geo/code_region/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from csv_detective.utils import full_word_strictly_inside_string
from csv_detective.process_text import _process_text

PROPORTION = 1

def _is(header):
    '''Score how strongly a column header looks like a region code label.

    The header is normalised with ``_process_text`` and compared with a
    fixed list of known labels.

    Returns:
        1.0 if the processed header equals one of the labels,
        0.5 if ``full_word_strictly_inside_string(label, processed_header)``
        is truthy for at least one label,
        0.0 otherwise.
    '''
    # 'reg': possible confusion with the region name?
    words_combinations_list = ['code region', 'reg', 'code insee region']
    processed_header = _process_text(header)

    # An exact match is the strongest signal; no need to scan for partial hits.
    if processed_header in words_combinations_list:
        return 1.0

    # A label embedded in a longer header only counts for half.
    if any(
        full_word_strictly_inside_string(words_combination, processed_header)
        for words_combination in words_combinations_list
    ):
        return 0.5

    return 0.0
15 changes: 15 additions & 0 deletions csv_detective/detect_labels/FR/geo/commune/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from csv_detective.utils import full_word_strictly_inside_string
from csv_detective.process_text import _process_text

PROPORTION = 1

def _is(header):
    '''Score how strongly a column header looks like a commune (town) name label.

    The header is normalised with ``_process_text`` and compared with a
    fixed list of known labels.

    Returns:
        1.0 if the processed header equals one of the labels,
        0.5 if ``full_word_strictly_inside_string(label, processed_header)``
        is truthy for at least one label,
        0.0 otherwise.
    '''
    words_combinations_list = ['commune', 'ville', 'libelle commune']
    processed_header = _process_text(header)

    # An exact match is the strongest signal; no need to scan for partial hits.
    if processed_header in words_combinations_list:
        return 1.0

    # A label embedded in a longer header only counts for half.
    if any(
        full_word_strictly_inside_string(words_combination, processed_header)
        for words_combination in words_combinations_list
    ):
        return 0.5

    return 0.0
15 changes: 15 additions & 0 deletions csv_detective/detect_labels/FR/geo/departement/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from csv_detective.utils import full_word_strictly_inside_string
from csv_detective.process_text import _process_text

PROPORTION = 1

def _is(header):
    '''Score how strongly a column header looks like a departement name label.

    The header is normalised with ``_process_text`` and compared with a
    fixed list of known labels.

    Returns:
        1.0 if the processed header equals one of the labels,
        0.5 if ``full_word_strictly_inside_string(label, processed_header)``
        is truthy for at least one label,
        0.0 otherwise.
    '''
    words_combinations_list = [
        'departement',
        'libelle du departement',
        'deplib',
        'nom dept',
        'dept',
        'libdepartement',
        'nom departement',
        'libelle dep',
        'libelle departement',
        'lb departements',
        'dep libusage',
        'lb departement',
        'nom dep',
    ]
    processed_header = _process_text(header)

    # An exact match is the strongest signal; no need to scan for partial hits.
    if processed_header in words_combinations_list:
        return 1.0

    # A label embedded in a longer header only counts for half.
    if any(
        full_word_strictly_inside_string(words_combination, processed_header)
        for words_combination in words_combinations_list
    ):
        return 0.5

    return 0.0
15 changes: 15 additions & 0 deletions csv_detective/detect_labels/FR/geo/insee_canton/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from csv_detective.utils import full_word_strictly_inside_string
from csv_detective.process_text import _process_text

PROPORTION = 1

def _is(header):
    '''Score how strongly a column header looks like a canton label.

    The header is normalised with ``_process_text`` and compared with a
    fixed list of known labels.

    Returns:
        1.0 if the processed header equals one of the labels,
        0.5 if ``full_word_strictly_inside_string(label, processed_header)``
        is truthy for at least one label,
        0.0 otherwise.
    '''
    words_combinations_list = ['insee canton', 'canton', 'cant', 'nom canton']
    processed_header = _process_text(header)

    # An exact match is the strongest signal; no need to scan for partial hits.
    if processed_header in words_combinations_list:
        return 1.0

    # A label embedded in a longer header only counts for half.
    if any(
        full_word_strictly_inside_string(words_combination, processed_header)
        for words_combination in words_combinations_list
    ):
        return 0.5

    return 0.0
15 changes: 15 additions & 0 deletions csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from csv_detective.utils import full_word_strictly_inside_string
from csv_detective.process_text import _process_text

PROPORTION = 1

def _is(header):
    '''Score how strongly a column header looks like a latitude / Y coordinate label.

    The header is normalised with ``_process_text`` and compared with a
    fixed list of known labels. The labels do not always identify the
    coordinate reference system.

    Returns:
        1.0 if the processed header equals one of the labels,
        0.5 if ``full_word_strictly_inside_string(label, processed_header)``
        is truthy for at least one label,
        0.0 otherwise.
    '''
    # Does not always detect CRS.
    # NOTE(review): 'latitude (y)' and 'coordinates.latitude' contain
    # punctuation; they can only match if _process_text keeps it -- confirm.
    words_combinations_list = [
        'latitude',
        'lat',
        'y',
        'yf',
        'yd',
        'y l93',
        'coordonnee y',
        'latitude lb93',
        'coord y',
        'ycoord',
        'geocodage y gps',
        'location latitude',
        'ylatitude',
        'ylat',
        'latitude (y)',
        'latitudeorg',
        'coordinates.latitude',
        'googlemap latitude',
        'latitudelieu',
        'latitude googlemap',
    ]
    processed_header = _process_text(header)

    # An exact match is the strongest signal; no need to scan for partial hits.
    if processed_header in words_combinations_list:
        return 1.0

    # A label embedded in a longer header only counts for half.
    if any(
        full_word_strictly_inside_string(words_combination, processed_header)
        for words_combination in words_combinations_list
    ):
        return 0.5

    return 0.0
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from csv_detective.utils import full_word_strictly_inside_string
from csv_detective.process_text import _process_text

PROPORTION = 1

def _is(header):
    '''Score how strongly a column header looks like a latitude label (WGS84 variants included).

    The header is normalised with ``_process_text`` and compared with a
    fixed list of known labels.

    Returns:
        1.0 if the processed header equals one of the labels,
        0.5 if ``full_word_strictly_inside_string(label, processed_header)``
        is truthy for at least one label,
        0.0 otherwise.
    '''
    # NOTE(review): 'latitude (y)', 'coordinates.latitude' and
    # 'latitude (wgs84)' contain punctuation; they can only match if
    # _process_text keeps it -- confirm.
    words_combinations_list = [
        'latitude',
        'lat',
        'y',
        'yf',
        'yd',
        'coordonnee y',
        'coord y',
        'ycoord',
        'geocodage y gps',
        'location latitude',
        'ylatitude',
        'ylat',
        'latitude (y)',
        'latitudeorg',
        'coordinates.latitude',
        'googlemap latitude',
        'latitudelieu',
        'latitude googlemap',
        'latitude wgs84',
        'y wgs84',
        'latitude (wgs84)',
    ]
    processed_header = _process_text(header)

    # An exact match is the strongest signal; no need to scan for partial hits.
    if processed_header in words_combinations_list:
        return 1.0

    # A label embedded in a longer header only counts for half.
    if any(
        full_word_strictly_inside_string(words_combination, processed_header)
        for words_combination in words_combinations_list
    ):
        return 0.5

    return 0.0
15 changes: 15 additions & 0 deletions csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from csv_detective.utils import full_word_strictly_inside_string
from csv_detective.process_text import _process_text

PROPORTION = 1

def _is(header):
    '''Score how strongly a column header looks like a longitude / X coordinate label.

    The header is normalised with ``_process_text`` and compared with a
    fixed list of known labels. The labels do not identify the coordinate
    reference system.

    Returns:
        1.0 if the processed header equals one of the labels,
        0.5 if ``full_word_strictly_inside_string(label, processed_header)``
        is truthy for at least one label,
        0.0 otherwise.
    '''
    # Does not detect CRS.
    words_combinations_list = [
        'longitude',
        'lon',
        'long',
        'geocodage x gps',
        'location longitude',
        'xlongitude',
        'lng',
        'xlong',
        'x',
        'xf',
        'xd',
    ]
    processed_header = _process_text(header)

    # An exact match is the strongest signal; no need to scan for partial hits.
    if processed_header in words_combinations_list:
        return 1.0

    # A label embedded in a longer header only counts for half.
    if any(
        full_word_strictly_inside_string(words_combination, processed_header)
        for words_combination in words_combinations_list
    ):
        return 0.5

    return 0.0
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from csv_detective.utils import full_word_strictly_inside_string
from csv_detective.process_text import _process_text

PROPORTION = 1

def _is(header):
    '''Score how strongly a column header looks like a longitude / X coordinate label.

    The header is normalised with ``_process_text`` and compared with a
    fixed list of known labels. The labels do not identify the coordinate
    reference system.

    Returns:
        1.0 if the processed header equals one of the labels,
        0.5 if ``full_word_strictly_inside_string(label, processed_header)``
        is truthy for at least one label,
        0.0 otherwise.
    '''
    # Does not detect CRS.
    words_combinations_list = [
        'longitude',
        'lon',
        'long',
        'geocodage x gps',
        'location longitude',
        'xlongitude',
        'lng',
        'xlong',
        'x',
        'xf',
        'xd',
    ]
    processed_header = _process_text(header)

    # An exact match is the strongest signal; no need to scan for partial hits.
    if processed_header in words_combinations_list:
        return 1.0

    # A label embedded in a longer header only counts for half.
    if any(
        full_word_strictly_inside_string(words_combination, processed_header)
        for words_combination in words_combinations_list
    ):
        return 0.5

    return 0.0
15 changes: 15 additions & 0 deletions csv_detective/detect_labels/FR/geo/pays/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from csv_detective.utils import full_word_strictly_inside_string
from csv_detective.process_text import _process_text

PROPORTION = 1

def _is(header):
    '''Score how strongly a column header looks like a country label.

    The header is normalised with ``_process_text`` and compared with a
    fixed list of known labels.

    Returns:
        1.0 if the processed header equals one of the labels,
        0.5 if ``full_word_strictly_inside_string(label, processed_header)``
        is truthy for at least one label,
        0.0 otherwise.
    '''
    words_combinations_list = [
        'pays',
        'payslieu',
        'paysorg',
        'country',
        'pays lib',
        'lieupays',
        'pays beneficiaire',
        'nom du pays',
        'journey start country',
        'libelle pays',
        'journey end country',
    ]
    processed_header = _process_text(header)

    # An exact match is the strongest signal; no need to scan for partial hits.
    if processed_header in words_combinations_list:
        return 1.0

    # A label embedded in a longer header only counts for half.
    if any(
        full_word_strictly_inside_string(words_combination, processed_header)
        for words_combination in words_combinations_list
    ):
        return 0.5

    return 0.0
15 changes: 15 additions & 0 deletions csv_detective/detect_labels/FR/geo/region/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from csv_detective.utils import full_word_strictly_inside_string
from csv_detective.process_text import _process_text

PROPORTION = 1

def _is(header):
    '''Score how strongly a column header looks like a region name label.

    The header is normalised with ``_process_text`` and compared with a
    fixed list of known labels.

    Returns:
        1.0 if the processed header equals one of the labels,
        0.5 if ``full_word_strictly_inside_string(label, processed_header)``
        is truthy for at least one label,
        0.0 otherwise.
    '''
    # NOTE(review): 'reg' is ambiguous between a region name and a region
    # code, so a bare 'reg' header gets the full score here.
    words_combinations_list = [
        'region',
        'libelle region',
        'nom region',
        'libelle reg',
        'nom reg',
        'reg libusage',
        'nom de la region',
        'regionorg',
        'regionlieu',
        'reg',
        'nom officiel region',
    ]
    processed_header = _process_text(header)

    # An exact match is the strongest signal; no need to scan for partial hits.
    if processed_header in words_combinations_list:
        return 1.0

    # A label embedded in a longer header only counts for half.
    if any(
        full_word_strictly_inside_string(words_combination, processed_header)
        for words_combination in words_combinations_list
    ):
        return 0.5

    return 0.0
Empty file.
15 changes: 15 additions & 0 deletions csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from csv_detective.utils import full_word_strictly_inside_string
from csv_detective.process_text import _process_text

PROPORTION = 1

def _is(header):
    '''Score how strongly a column header looks like an INSEE CSP code label.

    The header is normalised with ``_process_text`` and compared with a
    fixed list of known labels.

    Returns:
        1.0 if the processed header equals one of the labels,
        0.5 if ``full_word_strictly_inside_string(label, processed_header)``
        is truthy for at least one label,
        0.0 otherwise.
    '''
    words_combinations_list = ['code csp insee', 'code csp']
    processed_header = _process_text(header)

    # An exact match is the strongest signal; no need to scan for partial hits.
    if processed_header in words_combinations_list:
        return 1.0

    # A label embedded in a longer header only counts for half.
    if any(
        full_word_strictly_inside_string(words_combination, processed_header)
        for words_combination in words_combinations_list
    ):
        return 0.5

    return 0.0
15 changes: 15 additions & 0 deletions csv_detective/detect_labels/FR/other/code_rna/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from csv_detective.utils import full_word_strictly_inside_string
from csv_detective.process_text import _process_text

PROPORTION = 1

def _is(header):
    '''Score how strongly a column header looks like an RNA code label.

    The header is normalised with ``_process_text`` and compared with a
    fixed list of known labels.

    Returns:
        1.0 if the processed header equals one of the labels,
        0.5 if ``full_word_strictly_inside_string(label, processed_header)``
        is truthy for at least one label,
        0.0 otherwise.
    '''
    # NOTE(review): 'n° inscription association' contains a degree sign; it
    # can only match if _process_text keeps that character -- confirm.
    words_combinations_list = [
        'code rna',
        'rna',
        'n° inscription association',
        'identifiant association',
    ]
    processed_header = _process_text(header)

    # An exact match is the strongest signal; no need to scan for partial hits.
    if processed_header in words_combinations_list:
        return 1.0

    # A label embedded in a longer header only counts for half.
    if any(
        full_word_strictly_inside_string(words_combination, processed_header)
        for words_combination in words_combinations_list
    ):
        return 0.5

    return 0.0
15 changes: 15 additions & 0 deletions csv_detective/detect_labels/FR/other/code_waldec/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from csv_detective.utils import full_word_strictly_inside_string
from csv_detective.process_text import _process_text

PROPORTION = 1

def _is(header):
    '''Score how strongly a column header looks like a WALDEC code label.

    The header is normalised with ``_process_text`` and compared with a
    fixed list of known labels.

    Returns:
        1.0 if the processed header equals one of the labels,
        0.5 if ``full_word_strictly_inside_string(label, processed_header)``
        is truthy for at least one label,
        0.0 otherwise.
    '''
    words_combinations_list = ['code waldec', 'waldec']
    processed_header = _process_text(header)

    # An exact match is the strongest signal; no need to scan for partial hits.
    if processed_header in words_combinations_list:
        return 1.0

    # A label embedded in a longer header only counts for half.
    if any(
        full_word_strictly_inside_string(words_combination, processed_header)
        for words_combination in words_combinations_list
    ):
        return 0.5

    return 0.0
15 changes: 15 additions & 0 deletions csv_detective/detect_labels/FR/other/csp_insee/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from csv_detective.utils import full_word_strictly_inside_string
from csv_detective.process_text import _process_text

PROPORTION = 1

def _is(header):
    '''Score how strongly a column header looks like an INSEE CSP (socio-professional category) label.

    The header is normalised with ``_process_text`` and compared with a
    fixed list of known labels.

    Returns:
        1.0 if the processed header equals one of the labels,
        0.5 if ``full_word_strictly_inside_string(label, processed_header)``
        is truthy for at least one label,
        0.0 otherwise.
    '''
    # To improve? No specific header found in data.
    words_combinations_list = ['csp insee', 'csp', 'categorie socioprofessionnelle']
    processed_header = _process_text(header)

    # An exact match is the strongest signal; no need to scan for partial hits.
    if processed_header in words_combinations_list:
        return 1.0

    # A label embedded in a longer header only counts for half.
    if any(
        full_word_strictly_inside_string(words_combination, processed_header)
        for words_combination in words_combinations_list
    ):
        return 0.5

    return 0.0
15 changes: 15 additions & 0 deletions csv_detective/detect_labels/FR/other/date_fr/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from csv_detective.utils import full_word_strictly_inside_string
from csv_detective.process_text import _process_text

PROPORTION = 1

def _is(header):
    '''Score how strongly a column header looks like a date label.

    The header is normalised with ``_process_text`` and compared with a
    fixed list of known labels.

    Returns:
        1.0 if the processed header equals one of the labels,
        0.5 if ``full_word_strictly_inside_string(label, processed_header)``
        is truthy for at least one label,
        0.0 otherwise.
    '''
    # To improve: no header specific to 'fr' found in data.
    words_combinations_list = ['date']
    processed_header = _process_text(header)

    # An exact match is the strongest signal; no need to scan for partial hits.
    if processed_header in words_combinations_list:
        return 1.0

    # A label embedded in a longer header only counts for half.
    if any(
        full_word_strictly_inside_string(words_combination, processed_header)
        for words_combination in words_combinations_list
    ):
        return 0.5

    return 0.0
Loading

0 comments on commit d0c2060

Please sign in to comment.