Skip to content

Commit

Permalink
Merge pull request #24 from voi-oss/tableau-exposure-crawler-refactor
Browse files Browse the repository at this point in the history
chore(tableau-exposure-crawler): using batch API to fetch data
  • Loading branch information
samanmasarat authored Jan 25, 2024
2 parents 7aeb458 + 088cfe0 commit 7074c91
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 25 deletions.
60 changes: 44 additions & 16 deletions src/exposurescrawler/crawlers/tableau.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def _should_ignore_workbook(workbook, projects_to_ignore: Collection[str]) -> bo
# by workbooks under projects without a name.
if not workbook.project_name:
return True

return workbook.project_name in projects_to_ignore


Expand All @@ -41,7 +41,6 @@ def _parse_tables_from_sql(workbooks_sqls: WorkbookModelsMapping, models) -> Wor
logger().info('⚙️ Parsing SQL: looking for references to models')

output: WorkbookModelsMapping = {}

for workbook_reference, custom_sqls in workbooks_sqls.items():
# a list of dbt model represented as their original dicts from the manifest
all_found: List[dict] = []
Expand All @@ -66,11 +65,38 @@ def _parse_tables_from_sql(workbooks_sqls: WorkbookModelsMapping, models) -> Wor
return output


def retrieve_all_workbook_owner_map(tableau_client: TableauRestClient):
    """
    Fetch every workbook from Tableau in one batch call and index it by id.

    :param tableau_client: Tableau rest client
    :return: a dictionary mapping {workbook_id: WorkbookItem}
    """
    logger().info('⚙️ Retrieving all workbooks (batch)')

    all_workbooks = tableau_client.retrieve_all_workbooks()
    logger().info(f'✅ Fetched {len(all_workbooks)} workbooks')

    # Dict comprehension is the idiomatic form of dict(generator) (ruff C402)
    return {workbook.id: workbook for workbook in all_workbooks}


def retrieve_all_user_id_map(tableau_client: TableauRestClient):
    """
    Fetch every user from Tableau in one batch call and index it by id.

    :param tableau_client: Tableau rest client
    :return: a dictionary mapping {user_id: UserItem}
    """
    logger().info('⚙️ Retrieving all users (batch)')

    all_users = tableau_client.retrieve_all_users()
    # '✅' marks completion, matching the sibling workbook fetcher's log style
    logger().info(f'✅ Fetched {len(all_users)} users')

    # Dict comprehension is the idiomatic form of dict(generator) (ruff C402)
    return {user.id: user for user in all_users}


def tableau_crawler(
manifest_path: str,
dbt_package_name: str,
tableau_projects_to_ignore: Collection[str],
verbose: bool,
manifest_path: str,
dbt_package_name: str,
tableau_projects_to_ignore: Collection[str],
verbose: bool,
) -> None:
# Enable verbose logging
if verbose:
Expand Down Expand Up @@ -107,7 +133,7 @@ def tableau_crawler(
workbooks_models: WorkbookModelsMapping = {}

for workbook_reference, found in itertools.chain(
workbooks_custom_sql_models.items(), workbooks_native_sql_models.items()
workbooks_custom_sql_models.items(), workbooks_native_sql_models.items()
):
workbooks_models.setdefault(workbook_reference, []).extend(found)

Expand All @@ -123,12 +149,15 @@ def tableau_crawler(
logger().info('')
logger().info('🌏 Retrieving workbooks and authors metadata from the Tableau REST API')

# Fetching all workbooks and users using Tableau batch API and keep in a dictionary.
workbook_owner_map = retrieve_all_workbook_owner_map(tableau_client)
user_userid_map = retrieve_all_user_id_map(tableau_client)

# For every workbook and the models found, create exposures and add
# to the manifest (in-memory)
for workbook_reference, found in workbooks_models.items():
workbook = tableau_client.retrieve_workbook(workbook_reference.id)
owner = tableau_client.retrieve_user(workbook.owner_id)

workbook = workbook_owner_map[workbook_reference.id]
owner = user_userid_map[workbook.owner_id]
if _should_ignore_workbook(workbook, tableau_projects_to_ignore):
logger().debug(
f'⏩ Skipping workbook: {workbook.name} ({workbook.project_name} is ignored)'
Expand All @@ -137,7 +166,6 @@ def tableau_crawler(

exposure = DbtExposure.from_tableau_workbook(dbt_package_name, workbook, owner, found)
manifest.add_exposure(exposure, found)

# Persist the modified manifest
logger().info('')
logger().info(f'💾 Writing results to file: {manifest_path}')
Expand All @@ -156,7 +184,7 @@ def tableau_crawler(
required=True,
metavar='PROJECT_NAME',
help='The name of the dbt pacakge where the exposures should be added. If in doubt, check the '
'name of your dbt project on dbt_project.yml',
'name of your dbt project on dbt_project.yml',
)
@click.option(
'--tableau-ignore-projects',
Expand All @@ -166,10 +194,10 @@ def tableau_crawler(
)
@click.option('-v', '--verbose', is_flag=True, default=False, help='Enable verbose logging')
def tableau_crawler_command(
manifest_path: str,
dbt_package_name: str,
tableau_projects_to_ignore: Collection[str],
verbose: bool,
manifest_path: str,
dbt_package_name: str,
tableau_projects_to_ignore: Collection[str],
verbose: bool,
):
tableau_crawler(manifest_path, dbt_package_name, tableau_projects_to_ignore, verbose)

Expand Down
12 changes: 12 additions & 0 deletions src/exposurescrawler/tableau/rest_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,15 @@ def run_metadata_api(self, query: str):
response = self.server.metadata.query(query)

return response['data']

def retrieve_all_workbooks(self):
    """Return every workbook on the server, paging through the batch API."""
    with self.server.auth.sign_in(self.tableau_auth):
        # Pager transparently walks all result pages; materialize before
        # the session is signed out.
        return list(TSC.Pager(self.server.workbooks))

def retrieve_all_users(self):
    """Return every user on the server, paging through the batch API."""
    with self.server.auth.sign_in(self.tableau_auth):
        # Pager transparently walks all result pages; materialize before
        # the session is signed out.
        return list(TSC.Pager(self.server.users))
27 changes: 18 additions & 9 deletions tests/_integration/test_tableau_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def mock_graphql_custom_sql_result():
},
{
'query': "select * from sample_db.public.customers left join "
"sample_db.public.orders on customers.id = orders.customer_id",
"sample_db.public.orders on customers.id = orders.customer_id",
'name': 'Custom SQL Query',
'isEmbedded': None,
'database': {'name': 'SAMPLE_DB', 'connectionType': 'snowflake'},
Expand Down Expand Up @@ -150,22 +150,31 @@ def mock_tableau_rest_api():
UserDetailsMock = namedtuple('UserDetailsMock', ['id', 'fullname', 'name'])

workbook_details = {
'customers-workbook-luid': WorkbookDetailsMock(id='aaa', name='Customers workbook'),
'company-kpis-workbook-luid': WorkbookDetailsMock(id='bbb', name='Company KPIs workbook'),
'customers-workbook-luid': WorkbookDetailsMock(id='customers-workbook-luid', name='Customers workbook',
owner_id='user-id'),
'company-kpis-workbook-luid': WorkbookDetailsMock(id='company-kpis-workbook-luid', name='Company KPIs workbook',
owner_id='user-id'),
'orders-workbook-luid': WorkbookDetailsMock(
id='ccc', name='Orders workbook', tags=['certified']
id='orders-workbook-luid', name='Orders workbook', tags=['certified'], owner_id='user-id'
),
}
user_details = {
'user-id': UserDetailsMock('user-id', 'John Doe', 'john.doe@example.com')
}

def _get_workbook_details(workbook_id):
return workbook_details[workbook_id]

with patch('exposurescrawler.crawlers.tableau.TableauRestClient', autospec=True) as mock:
instance = mock.return_value
instance.retrieve_workbook.side_effect = _get_workbook_details
instance.retrieve_user.return_value = UserDetailsMock(
'user-id', 'John Doe', 'john.doe@example.com'
)
instance.retrieve_user.return_value = user_details['user-id']
instance.retrieve_all_users.return_value = [user_details['user-id']]
instance.retrieve_all_workbooks.return_value = [
workbook_details['customers-workbook-luid'],
workbook_details['company-kpis-workbook-luid'],
workbook_details['orders-workbook-luid']
]
yield


Expand All @@ -183,10 +192,10 @@ def test_tableau_crawler(manifest_path):
tableau_crawler(manifest_path, 'jaffle_shop', [], True)

final_manifest = mock.call_args.args[0].data
exposure = final_manifest['exposures']['exposure.jaffle_shop.tableau_orders_workbook_ccc']
exposure = final_manifest['exposures']['exposure.jaffle_shop.tableau_orders_workbook_ord']

assert len(final_manifest['exposures']) == 3
assert exposure['name'] == 'tableau_orders_workbook_ccc'
assert exposure['name'] == 'tableau_orders_workbook_ord'
assert 'Workbook description' in exposure['description']
assert 'https://my-tableau-server.com/path/to/workbook' in exposure['description']
assert exposure['type'] == 'Dashboard'
Expand Down

0 comments on commit 7074c91

Please sign in to comment.