Merge pull request #22 from etalab/type_detection_by_header

Type detection by header
datagouv · Feb 9, 2022 · d0c2060 · d0c2060
2 parents abeca72 + 4759595
commit d0c2060
Show file tree

Hide file tree

Showing 55 changed files with 737 additions and 17 deletions.
diff --git a/.gitignore b/.gitignore
@@ -11,5 +11,6 @@ build
 csv_detective.egg-info
 dist
 venv
+.DS_Store
 
 analyse_csv.py
diff --git a/csv_detective/all_packages.txt b/csv_detective/all_packages.txt
@@ -48,5 +48,46 @@ csv_detective.detect_fields.FR.geo.insee_canton
 csv_detective.detect_fields.FR.other.date_fr
 csv_detective.detect_fields.FR.other.code_waldec
 csv_detective.detect_fields.FR.other.code_rna
+csv_detective.detect_labels.FR.geo.adresse
+csv_detective.detect_labels.FR.geo.code_commune_insee
+csv_detective.detect_labels.FR.geo.code_departement
+csv_detective.detect_labels.FR.geo.code_fantoir
+csv_detective.detect_labels.FR.geo.code_postal
+csv_detective.detect_labels.FR.geo.code_region
+csv_detective.detect_labels.FR.geo.commune
+csv_detective.detect_labels.FR.geo.departement
+csv_detective.detect_labels.FR.geo.insee_canton
+csv_detective.detect_labels.FR.geo.latitude_l93
+csv_detective.detect_labels.FR.geo.latitude_wgs_fr_metropole
+csv_detective.detect_labels.FR.geo.longitude_l93
+csv_detective.detect_labels.FR.geo.longitude_wgs_fr_metropole
+csv_detective.detect_labels.FR.geo.pays
+csv_detective.detect_labels.FR.geo.region
+csv_detective.detect_labels.FR.other.code_csp_insee
+csv_detective.detect_labels.FR.other.code_rna
+csv_detective.detect_labels.FR.other.code_waldec
+csv_detective.detect_labels.FR.other.csp_insee
+csv_detective.detect_labels.FR.other.date_fr
+csv_detective.detect_labels.FR.other.insee_ape700
+csv_detective.detect_labels.FR.other.sexe
+csv_detective.detect_labels.FR.other.siren
+csv_detective.detect_labels.FR.other.siret
+csv_detective.detect_labels.FR.other.tel_fr
+csv_detective.detect_labels.FR.other.uai
+csv_detective.detect_labels.FR.temp.jour_de_la_semaine
+csv_detective.detect_labels.FR.temp.mois_de_annee
+csv_detective.detect_labels.geo.iso_country_code
+csv_detective.detect_labels.geo.json_geojson
+csv_detective.detect_labels.geo.latitude_wgs
+csv_detective.detect_labels.geo.latlon_wgs
+csv_detective.detect_labels.geo.longitude_wgs
+csv_detective.detect_labels.other.booleen
+csv_detective.detect_labels.other.email
+csv_detective.detect_labels.other.floats
 csv_detective.detect_labels.other.money
-
+csv_detective.detect_labels.other.twitter
+csv_detective.detect_labels.other.url
+csv_detective.detect_labels.temp.date
+csv_detective.detect_labels.temp.datetime_iso
+csv_detective.detect_labels.temp.datetime_rfc822
+csv_detective.detect_labels.temp.year
diff --git a/csv_detective/detect_labels/FR/__init__.py b/csv_detective/detect_labels/FR/__init__.py
diff --git a/csv_detective/detect_labels/FR/geo/__init__.py b/csv_detective/detect_labels/FR/geo/__init__.py
diff --git a/csv_detective/detect_labels/FR/geo/adresse/__init__.py b/csv_detective/detect_labels/FR/geo/adresse/__init__.py
@@ -0,0 +1,15 @@
+from csv_detective.utils import full_word_strictly_inside_string
+from csv_detective.process_text import _process_text
+
+PROPORTION = 1
+
+def _is(header):
+    '''Returns 1.0 if the processed header equals one of the expected word combinations, 0.5 if full_word_strictly_inside_string finds one of them inside it, else 0.0'''
+
+    words_combinations_list = ['adresse', 'adresse postale', 'adresse geographique', 'adr', 'adresse complete', 'adresse station']
+    processed_header = _process_text(header)
+
+    header_matches_words_combination = float(any([words_combination == processed_header for words_combination in words_combinations_list]))
+    words_combination_in_header = 0.5*float(any([full_word_strictly_inside_string(words_combination, processed_header) for words_combination in words_combinations_list]))
+
+    return max(header_matches_words_combination, words_combination_in_header)
diff --git a/csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py b/csv_detective/detect_labels/FR/geo/code_commune_insee/__init__.py
@@ -0,0 +1,15 @@
+from csv_detective.utils import full_word_strictly_inside_string
+from csv_detective.process_text import _process_text
+
+PROPORTION = 1
+
+def _is(header):
+    '''Returns 1.0 if the processed header equals one of the expected word combinations, 0.5 if full_word_strictly_inside_string finds one of them inside it, else 0.0'''
+
+    words_combinations_list = ['code commune insee', 'code insee', 'codes insee']
+    processed_header = _process_text(header)
+
+    header_matches_words_combination = float(any([words_combination == processed_header for words_combination in words_combinations_list]))
+    words_combination_in_header = 0.5*float(any([full_word_strictly_inside_string(words_combination, processed_header) for words_combination in words_combinations_list]))
+
+    return max(header_matches_words_combination, words_combination_in_header)
diff --git a/csv_detective/detect_labels/FR/geo/code_departement/__init__.py b/csv_detective/detect_labels/FR/geo/code_departement/__init__.py
@@ -0,0 +1,15 @@
+from csv_detective.utils import full_word_strictly_inside_string
+from csv_detective.process_text import _process_text
+
+PROPORTION = 1
+
+def _is(header):
+    '''Returns 1.0 if the processed header equals one of the expected word combinations, 0.5 if full_word_strictly_inside_string finds one of them inside it, else 0.0'''
+
+    words_combinations_list = ['code departement', 'dep'] #'dep': Possible confusion with dep name?
+    processed_header = _process_text(header)
+
+    header_matches_words_combination = float(any([words_combination == processed_header for words_combination in words_combinations_list]))
+    words_combination_in_header = 0.5*float(any([full_word_strictly_inside_string(words_combination, processed_header) for words_combination in words_combinations_list]))
+
+    return max(header_matches_words_combination, words_combination_in_header)
diff --git a/csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py b/csv_detective/detect_labels/FR/geo/code_fantoir/__init__.py
@@ -0,0 +1,15 @@
+from csv_detective.utils import full_word_strictly_inside_string
+from csv_detective.process_text import _process_text
+
+PROPORTION = 1
+
+def _is(header):
+    '''Returns 1.0 if the processed header equals one of the expected word combinations, 0.5 if full_word_strictly_inside_string finds one of them inside it, else 0.0'''
+
+    words_combinations_list = ['cadastre1', 'code fantoir', 'fantoir']
+    processed_header = _process_text(header)
+
+    header_matches_words_combination = float(any([words_combination == processed_header for words_combination in words_combinations_list]))
+    words_combination_in_header = 0.5*float(any([full_word_strictly_inside_string(words_combination, processed_header) for words_combination in words_combinations_list]))
+
+    return max(header_matches_words_combination, words_combination_in_header)
diff --git a/csv_detective/detect_labels/FR/geo/code_postal/__init__.py b/csv_detective/detect_labels/FR/geo/code_postal/__init__.py
@@ -0,0 +1,15 @@
+from csv_detective.utils import full_word_strictly_inside_string
+from csv_detective.process_text import _process_text
+
+PROPORTION = 1
+
+def _is(header):
+    '''Returns 1.0 if the processed header equals one of the expected word combinations, 0.5 if full_word_strictly_inside_string finds one of them inside it, else 0.0'''
+
+    words_combinations_list = ['code postal', 'postal code', 'postcode', 'post code', 'cp', 'codes postaux', 'location postcode']
+    processed_header = _process_text(header)
+
+    header_matches_words_combination = float(any([words_combination == processed_header for words_combination in words_combinations_list]))
+    words_combination_in_header = 0.5*float(any([full_word_strictly_inside_string(words_combination, processed_header) for words_combination in words_combinations_list]))
+
+    return max(header_matches_words_combination, words_combination_in_header)
diff --git a/csv_detective/detect_labels/FR/geo/code_region/__init__.py b/csv_detective/detect_labels/FR/geo/code_region/__init__.py
@@ -0,0 +1,15 @@
+from csv_detective.utils import full_word_strictly_inside_string
+from csv_detective.process_text import _process_text
+
+PROPORTION = 1
+
+def _is(header):
+    '''Returns 1.0 if the processed header equals one of the expected word combinations, 0.5 if full_word_strictly_inside_string finds one of them inside it, else 0.0'''
+
+    words_combinations_list = ['code region', 'reg', 'code insee region'] #'reg' : possible confusion with region name?
+    processed_header = _process_text(header)
+
+    header_matches_words_combination = float(any([words_combination == processed_header for words_combination in words_combinations_list]))
+    words_combination_in_header = 0.5*float(any([full_word_strictly_inside_string(words_combination, processed_header) for words_combination in words_combinations_list]))
+
+    return max(header_matches_words_combination, words_combination_in_header)
diff --git a/csv_detective/detect_labels/FR/geo/commune/__init__.py b/csv_detective/detect_labels/FR/geo/commune/__init__.py
@@ -0,0 +1,15 @@
+from csv_detective.utils import full_word_strictly_inside_string
+from csv_detective.process_text import _process_text
+
+PROPORTION = 1
+
+def _is(header):
+    '''Returns 1.0 if the processed header equals one of the expected word combinations, 0.5 if full_word_strictly_inside_string finds one of them inside it, else 0.0'''
+
+    words_combinations_list = ['commune', 'ville', 'libelle commune']
+    processed_header = _process_text(header)
+
+    header_matches_words_combination = float(any([words_combination == processed_header for words_combination in words_combinations_list]))
+    words_combination_in_header = 0.5*float(any([full_word_strictly_inside_string(words_combination, processed_header) for words_combination in words_combinations_list]))
+
+    return max(header_matches_words_combination, words_combination_in_header)
diff --git a/csv_detective/detect_labels/FR/geo/departement/__init__.py b/csv_detective/detect_labels/FR/geo/departement/__init__.py
@@ -0,0 +1,15 @@
+from csv_detective.utils import full_word_strictly_inside_string
+from csv_detective.process_text import _process_text
+
+PROPORTION = 1
+
+def _is(header):
+    '''Returns 1.0 if the processed header equals one of the expected word combinations, 0.5 if full_word_strictly_inside_string finds one of them inside it, else 0.0'''
+
+    words_combinations_list = ['departement', 'libelle du departement', 'deplib', 'nom dept', 'dept', 'libdepartement', 'nom departement', 'libelle dep', 'libelle departement', 'lb departements', 'dep libusage', 'lb departement', 'nom dep']
+    processed_header = _process_text(header)
+
+    header_matches_words_combination = float(any([words_combination == processed_header for words_combination in words_combinations_list]))
+    words_combination_in_header = 0.5*float(any([full_word_strictly_inside_string(words_combination, processed_header) for words_combination in words_combinations_list]))
+
+    return max(header_matches_words_combination, words_combination_in_header)
diff --git a/csv_detective/detect_labels/FR/geo/insee_canton/__init__.py b/csv_detective/detect_labels/FR/geo/insee_canton/__init__.py
@@ -0,0 +1,15 @@
+from csv_detective.utils import full_word_strictly_inside_string
+from csv_detective.process_text import _process_text
+
+PROPORTION = 1
+
+def _is(header):
+    '''Returns 1.0 if the processed header equals one of the expected word combinations, 0.5 if full_word_strictly_inside_string finds one of them inside it, else 0.0'''
+
+    words_combinations_list = ['insee canton', 'canton', 'cant', 'nom canton']
+    processed_header = _process_text(header)
+
+    header_matches_words_combination = float(any([words_combination == processed_header for words_combination in words_combinations_list]))
+    words_combination_in_header = 0.5*float(any([full_word_strictly_inside_string(words_combination, processed_header) for words_combination in words_combinations_list]))
+
+    return max(header_matches_words_combination, words_combination_in_header)
diff --git a/csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py b/csv_detective/detect_labels/FR/geo/latitude_l93/__init__.py
@@ -0,0 +1,15 @@
+from csv_detective.utils import full_word_strictly_inside_string
+from csv_detective.process_text import _process_text
+
+PROPORTION = 1
+
+def _is(header):
+    '''Returns 1.0 if the processed header equals one of the expected word combinations, 0.5 if full_word_strictly_inside_string finds one of them inside it, else 0.0'''
+
+    words_combinations_list = ['latitude', 'lat', 'y', 'yf', 'yd', 'y l93', 'coordonnee y', 'latitude lb93', 'coord y', 'ycoord', 'geocodage y gps', 'location latitude', 'ylatitude', 'ylat', 'latitude (y)', 'latitudeorg', 'coordinates.latitude', 'googlemap latitude', 'latitudelieu', 'latitude googlemap'] #Does not always detect CRS
+    processed_header = _process_text(header)
+
+    header_matches_words_combination = float(any([words_combination == processed_header for words_combination in words_combinations_list]))
+    words_combination_in_header = 0.5*float(any([full_word_strictly_inside_string(words_combination, processed_header) for words_combination in words_combinations_list]))
+
+    return max(header_matches_words_combination, words_combination_in_header)
diff --git a/csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py b/csv_detective/detect_labels/FR/geo/latitude_wgs_fr_metropole/__init__.py
@@ -0,0 +1,15 @@
+from csv_detective.utils import full_word_strictly_inside_string
+from csv_detective.process_text import _process_text
+
+PROPORTION = 1
+
+def _is(header):
+    '''Returns 1.0 if the processed header equals one of the expected word combinations, 0.5 if full_word_strictly_inside_string finds one of them inside it, else 0.0'''
+
+    words_combinations_list = ['latitude', 'lat', 'y', 'yf', 'yd', 'coordonnee y', 'coord y', 'ycoord', 'geocodage y gps', 'location latitude', 'ylatitude', 'ylat', 'latitude (y)', 'latitudeorg', 'coordinates.latitude', 'googlemap latitude', 'latitudelieu', 'latitude googlemap', 'latitude wgs84', 'y wgs84', 'latitude (wgs84)']
+    processed_header = _process_text(header)
+
+    header_matches_words_combination = float(any([words_combination == processed_header for words_combination in words_combinations_list]))
+    words_combination_in_header = 0.5*float(any([full_word_strictly_inside_string(words_combination, processed_header) for words_combination in words_combinations_list]))
+
+    return max(header_matches_words_combination, words_combination_in_header)
diff --git a/csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py b/csv_detective/detect_labels/FR/geo/longitude_l93/__init__.py
@@ -0,0 +1,15 @@
+from csv_detective.utils import full_word_strictly_inside_string
+from csv_detective.process_text import _process_text
+
+PROPORTION = 1
+
+def _is(header):
+    '''Returns 1.0 if the processed header equals one of the expected word combinations, 0.5 if full_word_strictly_inside_string finds one of them inside it, else 0.0'''
+
+    words_combinations_list = ['longitude', 'lon', 'long', 'geocodage x gps', 'location longitude', 'xlongitude', 'lng', 'xlong', 'x', 'xf', 'xd'] #Does not detect CRS
+    processed_header = _process_text(header)
+
+    header_matches_words_combination = float(any([words_combination == processed_header for words_combination in words_combinations_list]))
+    words_combination_in_header = 0.5*float(any([full_word_strictly_inside_string(words_combination, processed_header) for words_combination in words_combinations_list]))
+
+    return max(header_matches_words_combination, words_combination_in_header)
diff --git a/csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py b/csv_detective/detect_labels/FR/geo/longitude_wgs_fr_metropole/__init__.py
@@ -0,0 +1,15 @@
+from csv_detective.utils import full_word_strictly_inside_string
+from csv_detective.process_text import _process_text
+
+PROPORTION = 1
+
+def _is(header):
+    '''Returns 1.0 if the processed header equals one of the expected word combinations, 0.5 if full_word_strictly_inside_string finds one of them inside it, else 0.0'''
+
+    words_combinations_list = ['longitude', 'lon', 'long', 'geocodage x gps', 'location longitude', 'xlongitude', 'lng', 'xlong', 'x', 'xf', 'xd'] #Does not detect CRS
+    processed_header = _process_text(header)
+
+    header_matches_words_combination = float(any([words_combination == processed_header for words_combination in words_combinations_list]))
+    words_combination_in_header = 0.5*float(any([full_word_strictly_inside_string(words_combination, processed_header) for words_combination in words_combinations_list]))
+
+    return max(header_matches_words_combination, words_combination_in_header)
diff --git a/csv_detective/detect_labels/FR/geo/pays/__init__.py b/csv_detective/detect_labels/FR/geo/pays/__init__.py
@@ -0,0 +1,15 @@
+from csv_detective.utils import full_word_strictly_inside_string
+from csv_detective.process_text import _process_text
+
+PROPORTION = 1
+
+def _is(header):
+    '''Returns 1.0 if the processed header equals one of the expected word combinations, 0.5 if full_word_strictly_inside_string finds one of them inside it, else 0.0'''
+
+    words_combinations_list = ['pays', 'payslieu', 'paysorg', 'country', 'pays lib', 'lieupays', 'pays beneficiaire', 'nom du pays', 'journey start country', 'libelle pays', 'journey end country']
+    processed_header = _process_text(header)
+
+    header_matches_words_combination = float(any([words_combination == processed_header for words_combination in words_combinations_list]))
+    words_combination_in_header = 0.5*float(any([full_word_strictly_inside_string(words_combination, processed_header) for words_combination in words_combinations_list]))
+
+    return max(header_matches_words_combination, words_combination_in_header)
diff --git a/csv_detective/detect_labels/FR/geo/region/__init__.py b/csv_detective/detect_labels/FR/geo/region/__init__.py
@@ -0,0 +1,15 @@
+from csv_detective.utils import full_word_strictly_inside_string
+from csv_detective.process_text import _process_text
+
+PROPORTION = 1
+
+def _is(header):
+    '''Returns 1.0 if the processed header equals one of the expected word combinations, 0.5 if full_word_strictly_inside_string finds one of them inside it, else 0.0'''
+
+    words_combinations_list = ['region', 'libelle region', 'nom region', 'libelle reg', 'nom reg', 'reg libusage', 'nom de la region', 'regionorg', 'regionlieu', 'reg', 'nom officiel region']
+    processed_header = _process_text(header)
+
+    header_matches_words_combination = float(any([words_combination == processed_header for words_combination in words_combinations_list]))
+    words_combination_in_header = 0.5*float(any([full_word_strictly_inside_string(words_combination, processed_header) for words_combination in words_combinations_list]))
+
+    return max(header_matches_words_combination, words_combination_in_header)
diff --git a/csv_detective/detect_labels/FR/other/__init__.py b/csv_detective/detect_labels/FR/other/__init__.py
diff --git a/csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py b/csv_detective/detect_labels/FR/other/code_csp_insee/__init__.py
@@ -0,0 +1,15 @@
+from csv_detective.utils import full_word_strictly_inside_string
+from csv_detective.process_text import _process_text
+
+PROPORTION = 1
+
+def _is(header):
+    '''Returns 1.0 if the processed header equals one of the expected word combinations, 0.5 if full_word_strictly_inside_string finds one of them inside it, else 0.0'''
+
+    words_combinations_list = ['code csp insee', 'code csp']
+    processed_header = _process_text(header)
+
+    header_matches_words_combination = float(any([words_combination == processed_header for words_combination in words_combinations_list]))
+    words_combination_in_header = 0.5*float(any([full_word_strictly_inside_string(words_combination, processed_header) for words_combination in words_combinations_list]))
+
+    return max(header_matches_words_combination, words_combination_in_header)
diff --git a/csv_detective/detect_labels/FR/other/code_rna/__init__.py b/csv_detective/detect_labels/FR/other/code_rna/__init__.py
@@ -0,0 +1,15 @@
+from csv_detective.utils import full_word_strictly_inside_string
+from csv_detective.process_text import _process_text
+
+PROPORTION = 1
+
+def _is(header):
+    '''Returns 1.0 if the processed header equals one of the expected word combinations, 0.5 if full_word_strictly_inside_string finds one of them inside it, else 0.0'''
+
+    words_combinations_list = ['code rna', 'rna', 'n° inscription association', 'identifiant association']
+    processed_header = _process_text(header)
+
+    header_matches_words_combination = float(any([words_combination == processed_header for words_combination in words_combinations_list]))
+    words_combination_in_header = 0.5*float(any([full_word_strictly_inside_string(words_combination, processed_header) for words_combination in words_combinations_list]))
+
+    return max(header_matches_words_combination, words_combination_in_header)
diff --git a/csv_detective/detect_labels/FR/other/code_waldec/__init__.py b/csv_detective/detect_labels/FR/other/code_waldec/__init__.py
@@ -0,0 +1,15 @@
+from csv_detective.utils import full_word_strictly_inside_string
+from csv_detective.process_text import _process_text
+
+PROPORTION = 1
+
+def _is(header):
+    '''Returns 1.0 if the processed header equals one of the expected word combinations, 0.5 if full_word_strictly_inside_string finds one of them inside it, else 0.0'''
+
+    words_combinations_list = ['code waldec', 'waldec']
+    processed_header = _process_text(header)
+
+    header_matches_words_combination = float(any([words_combination == processed_header for words_combination in words_combinations_list]))
+    words_combination_in_header = 0.5*float(any([full_word_strictly_inside_string(words_combination, processed_header) for words_combination in words_combinations_list]))
+
+    return max(header_matches_words_combination, words_combination_in_header)
diff --git a/csv_detective/detect_labels/FR/other/csp_insee/__init__.py b/csv_detective/detect_labels/FR/other/csp_insee/__init__.py
@@ -0,0 +1,15 @@
+from csv_detective.utils import full_word_strictly_inside_string
+from csv_detective.process_text import _process_text
+
+PROPORTION = 1
+
+def _is(header):
+    '''Returns 1.0 if the processed header equals one of the expected word combinations, 0.5 if full_word_strictly_inside_string finds one of them inside it, else 0.0'''
+
+    words_combinations_list = ['csp insee', 'csp', 'categorie socioprofessionnelle'] #To improve? No specific header found in data
+    processed_header = _process_text(header)
+
+    header_matches_words_combination = float(any([words_combination == processed_header for words_combination in words_combinations_list]))
+    words_combination_in_header = 0.5*float(any([full_word_strictly_inside_string(words_combination, processed_header) for words_combination in words_combinations_list]))
+
+    return max(header_matches_words_combination, words_combination_in_header)
diff --git a/csv_detective/detect_labels/FR/other/date_fr/__init__.py b/csv_detective/detect_labels/FR/other/date_fr/__init__.py
@@ -0,0 +1,15 @@
+from csv_detective.utils import full_word_strictly_inside_string
+from csv_detective.process_text import _process_text
+
+PROPORTION = 1
+
+def _is(header):
+    '''Returns 1.0 if the processed header equals one of the expected word combinations, 0.5 if full_word_strictly_inside_string finds one of them inside it, else 0.0'''
+
+    words_combinations_list = ['date'] #To improve: no header specific to 'fr' found in data
+    processed_header = _process_text(header)
+
+    header_matches_words_combination = float(any([words_combination == processed_header for words_combination in words_combinations_list]))
+    words_combination_in_header = 0.5*float(any([full_word_strictly_inside_string(words_combination, processed_header) for words_combination in words_combinations_list]))
+
+    return max(header_matches_words_combination, words_combination_in_header)