Skip to content

Commit

Permalink
Ignore tables without table_type from Glue and Hive
Browse files Browse the repository at this point in the history
* Ignore tables without a table_type parameter when listing Iceberg tables from the Glue and Hive catalogs (#1331)

* Use the TABLE_TYPE constant instead of the "table_type" string literal

---------

Co-authored-by: Wenzhuo Zhao <zhaowenzhuo01@bilibili.com>
  • Loading branch information
gitzwz and Wenzhuo Zhao authored Nov 19, 2024
1 parent 2cbc77d commit a66ddc0
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 5 deletions.
2 changes: 1 addition & 1 deletion pyiceberg/catalog/glue.py
Original file line number Diff line number Diff line change
Expand Up @@ -773,4 +773,4 @@ def drop_view(self, identifier: Union[str, Identifier]) -> None:

@staticmethod
def __is_iceberg_table(table: TableTypeDef) -> bool:
return table.get("Parameters", {}).get("table_type", "").lower() == ICEBERG
return table.get("Parameters", {}).get(TABLE_TYPE, "").lower() == ICEBERG
2 changes: 1 addition & 1 deletion pyiceberg/catalog/hive.py
Original file line number Diff line number Diff line change
Expand Up @@ -651,7 +651,7 @@ def list_tables(self, namespace: Union[str, Identifier]) -> List[Identifier]:
for table in open_client.get_table_objects_by_name(
dbname=database_name, tbl_names=open_client.get_all_tables(db_name=database_name)
)
if table.parameters[TABLE_TYPE].lower() == ICEBERG
if table.parameters.get(TABLE_TYPE, "").lower() == ICEBERG
]

def list_namespaces(self, namespace: Union[str, Identifier] = ()) -> List[Identifier]:
Expand Down
10 changes: 10 additions & 0 deletions tests/catalog/test_glue.py
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,7 @@ def test_list_tables(
test_catalog.create_namespace(namespace=database_name)

non_iceberg_table_name = "non_iceberg_table"
non_table_type_table_name = "non_table_type_table"
glue_client = boto3.client("glue", endpoint_url=moto_endpoint_url)
glue_client.create_table(
DatabaseName=database_name,
Expand All @@ -458,12 +459,21 @@ def test_list_tables(
"Parameters": {"table_type": "noniceberg"},
},
)
glue_client.create_table(
DatabaseName=database_name,
TableInput={
"Name": non_table_type_table_name,
"TableType": "OTHER_TABLE_TYPE",
"Parameters": {},
},
)

for table_name in table_list:
test_catalog.create_table((database_name, table_name), table_schema_nested)
loaded_table_list = test_catalog.list_tables(database_name)

assert (database_name, non_iceberg_table_name) not in loaded_table_list
assert (database_name, non_table_type_table_name) not in loaded_table_list
for table_name in table_list:
assert (database_name, table_name) in loaded_table_list

Expand Down
10 changes: 7 additions & 3 deletions tests/catalog/test_hive.py
Original file line number Diff line number Diff line change
Expand Up @@ -919,16 +919,20 @@ def test_list_tables(hive_table: HiveTable) -> None:
tbl3.tableName = "table3"
tbl3.dbName = "database"
tbl3.parameters["table_type"] = "non_iceberg"
tbl4 = deepcopy(hive_table)
tbl4.tableName = "table4"
tbl4.dbName = "database"
tbl4.parameters.pop("table_type")

catalog._client = MagicMock()
catalog._client.__enter__().get_all_tables.return_value = ["table1", "table2", "table3"]
catalog._client.__enter__().get_table_objects_by_name.return_value = [tbl1, tbl2, tbl3]
catalog._client.__enter__().get_all_tables.return_value = ["table1", "table2", "table3", "table4"]
catalog._client.__enter__().get_table_objects_by_name.return_value = [tbl1, tbl2, tbl3, tbl4]

got_tables = catalog.list_tables("database")
assert got_tables == [("database", "table1"), ("database", "table2")]
catalog._client.__enter__().get_all_tables.assert_called_with(db_name="database")
catalog._client.__enter__().get_table_objects_by_name.assert_called_with(
dbname="database", tbl_names=["table1", "table2", "table3"]
dbname="database", tbl_names=["table1", "table2", "table3", "table4"]
)


Expand Down

0 comments on commit a66ddc0

Please sign in to comment.