diff --git a/mira/dkg/client.py b/mira/dkg/client.py index 972a84c7..43b1b87d 100644 --- a/mira/dkg/client.py +++ b/mira/dkg/client.py @@ -803,7 +803,7 @@ def get_terms( ) for synonym in synonyms or []: norm_text = normalize(synonym) - if norm_text: + if norm_text.strip(): yield Term( norm_text=norm_text, text=synonym, diff --git a/mira/dkg/construct.py b/mira/dkg/construct.py index 3b8b04ae..c136f770 100644 --- a/mira/dkg/construct.py +++ b/mira/dkg/construct.py @@ -985,6 +985,7 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str: if parse_results.graph_document is None: click.secho(f"No graphs in {prefix}, skipping", fg="red") + use_case_paths.EDGES_PATHS.pop(prefix) continue _graphs = parse_results.graph_document.graphs @@ -1104,15 +1105,14 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str: if add_xref_edges: for xref in node.xrefs: - if not isinstance(xref, Xref): + if not isinstance(xref, obograph.Xref): raise TypeError(f"Invalid type: {type(xref)}: {xref}") if not xref.value: continue if xref.value.prefix in obograph.PROVENANCE_PREFIXES: # Don't add provenance information as xrefs continue - edges.append( - ( + xref_edge_info = ( node.curie, xref.value.curie, "xref", @@ -1121,7 +1121,8 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str: graph_id, version or "", ) - ) + if xref_edge_info not in edges: + edges.append(xref_edge_info) if xref.value.curie not in nodes: node_sources[node.replaced_by].add(prefix) nodes[xref.value.curie] = NodeInfo( diff --git a/mira/dkg/resources/geonames.py b/mira/dkg/resources/geonames.py index 1f25613a..2b782158 100644 --- a/mira/dkg/resources/geonames.py +++ b/mira/dkg/resources/geonames.py @@ -109,14 +109,20 @@ def get_cities(code_to_country, code_to_admin1, code_to_admin2, *, minimum_popul ), ) + cities_df = cities_df[cities_df.population.astype(int) > minimum_population] cities_df.synonyms = cities_df.synonyms.str.split(",") terms = {} for term in code_to_country.values(): terms[term.identifier] = term + for term in code_to_admin1.values(): + terms[term.identifier] = term + for term in code_to_admin2.values(): + terms[term.identifier] = term cols = ["geonames_id", "name", "synonyms", "country_code", "admin1", "admin2", "population"] for identifier, name, synonyms, country, admin1, admin2, population in (cities_df[cols].values): + terms[identifier] = term = Term.from_triple("geonames", identifier,name) if synonyms and not isinstance(synonyms, float): for synoynm in synonyms: term.append_synonym(synoynm) @@ -131,8 +137,6 @@ def get_cities(code_to_country, code_to_admin1, code_to_admin2, *, minimum_popul print("could not find admin1", admin1_full) continue - terms[admin1_term.identifier] = admin1_term - if pd.notna(admin2): admin2_full = f"{country}.{admin1}.{admin2}" admin2_term = code_to_admin2.get(admin2_full) @@ -141,15 +145,9 @@ def get_cities(code_to_country, code_to_admin1, code_to_admin2, *, minimum_popul # print("could not find admin2", admin2_full) else: term.append_relationship(part_of, admin2_term) - terms[admin2_term.identifier] = admin2_term else: # pd.notna(admin1): # If there's no admin 2, just annotate directly onto admin 1 term.append_relationship(part_of, admin1_term) - # We skip cities that don't meet the minimum population requirement - if int(population) < minimum_population: - continue - terms[identifier] = term = Term.from_triple("geonames", identifier, - name) return terms