From aab244671aa569f578c52566306c9d1ba568a323 Mon Sep 17 00:00:00 2001 From: Kaushik <108662423+kaushikravichandran@users.noreply.github.com> Date: Mon, 20 Nov 2023 16:55:55 -0500 Subject: [PATCH] Hackernews search in EVADB (#1362) This PR supports searching Hackernews stories in EVADB: - We treat Hackernews as a data source - Algolia search API for hackernews is used for searching - GET requests are used - We support searching within: `story`, `comment`, `URL`, `poll` or a combination of them - Syntax: ``` params = { "query": "EVADB", "tags": "(story,poll)", } query = f"""CREATE DATABASE hackernews_data WITH ENGINE = "hackernews", PARAMETERS = {params};""" ``` --------- Co-authored-by: Kaushik Ravichandran --- docs/_toc.yml | 1 + .../source/reference/databases/hackernews.rst | 44 +++++ .../databases/hackernews/__init__.py | 15 ++ .../hackernews/hackernews_handler.py | 152 ++++++++++++++++++ .../databases/hackernews/table_column_info.py | 23 +++ evadb/third_party/databases/interface.py | 2 + setup.py | 3 + .../long/test_hackernews_datasource.py | 56 +++++++ 8 files changed, 296 insertions(+) create mode 100644 docs/source/reference/databases/hackernews.rst create mode 100644 evadb/third_party/databases/hackernews/__init__.py create mode 100644 evadb/third_party/databases/hackernews/hackernews_handler.py create mode 100644 evadb/third_party/databases/hackernews/table_column_info.py create mode 100644 test/integration_tests/long/test_hackernews_datasource.py diff --git a/docs/_toc.yml b/docs/_toc.yml index ca191ce42d..eb57363f4e 100644 --- a/docs/_toc.yml +++ b/docs/_toc.yml @@ -79,6 +79,7 @@ parts: - file: source/reference/databases/clickhouse - file: source/reference/databases/github - file: source/reference/databases/snowflake + - file: source/reference/databases/hackernews - file: source/reference/vector_databases/index title: Vector Databases diff --git a/docs/source/reference/databases/hackernews.rst b/docs/source/reference/databases/hackernews.rst new 
file mode 100644 index 0000000000..d96112e815 --- /dev/null +++ b/docs/source/reference/databases/hackernews.rst @@ -0,0 +1,44 @@ +Hackernews +========== + +The connection to Hackernews is based on the `Algolia Hackernews `_ API. + +Dependency +---------- + +* requests + + +Parameters +---------- + +Required: + +* ``query`` is the search query for getting the results. + +Optional: + +* ``tags`` is the tag used for filtering the query results. Check `available tags `_ to see a list of available filter tags. + +Create Connection +----------------- + +.. code-block:: text + + CREATE DATABASE hackernews_data WITH ENGINE = 'hackernews', PARAMETERS = { + "query": "EVADB", + "tags": "story" + }; + +Supported Tables +---------------- + +* ``search_results``: Lists the search query results. Check `table_column_info.py `_ for all the available columns in the table. + +.. code-block:: sql + + SELECT * FROM hackernews_data.search_results LIMIT 3; + +.. note:: + + Looking for another table from Hackernews? Please raise a `Feature Request `_. diff --git a/evadb/third_party/databases/hackernews/__init__.py b/evadb/third_party/databases/hackernews/__init__.py new file mode 100644 index 0000000000..7051570944 --- /dev/null +++ b/evadb/third_party/databases/hackernews/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2018-2023 EvaDB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class HackernewsSearchHandler(DBHandler):
    """Data source handler that exposes Hackernews search results as a table.

    The handler issues GET requests against the Algolia Hackernews search
    endpoint and publishes the matches through a single table,
    ``search_results``, whose columns are listed in ``HACKERNEWS_COLUMNS``.
    """

    # Algolia endpoint used to run the search.
    _SEARCH_URL = "http://hn.algolia.com/api/v1/search"
    # Upper bound (in seconds) on every HTTP call, so that an unresponsive
    # host cannot block a query forever.
    _REQUEST_TIMEOUT = 10

    @staticmethod
    def connection():
        """
        Probe outbound HTTP connectivity.

        Declared as a static method so that it can be invoked both as
        ``self.connection()`` (which is how the methods below call it) and as
        ``HackernewsSearchHandler.connection()``.
        Returns:
            bool: True if the probe request answered with HTTP 200, False if
            it answered with another status or could not be completed.
        """
        try:
            response = requests.get(
                "https://www.google.com/",
                timeout=HackernewsSearchHandler._REQUEST_TIMEOUT,
            )
        except requests.RequestException:
            # DNS failure, refused connection, timeout, ...: report
            # "not connected" instead of propagating the exception.
            return False
        return response.status_code == 200

    def __init__(self, name: str, **kwargs):
        """
        Initialize the handler.
        Args:
            name (str): name of the DB handler instance
            **kwargs: arbitrary keyword arguments for establishing the connection.
                ``query`` is the search text and ``tags`` the optional tag
                filter; both default to an empty string.
        """
        super().__init__(name)
        self.query = kwargs.get("query", "")
        self.tags = kwargs.get("tags", "")

    @property
    def supported_table(self):
        """
        Describe the tables served by this handler.
        Returns:
            dict: table name -> {"columns": column list, "generator": row generator}.
            The generator is created on every access but only contacts the
            search endpoint once it is iterated.
        """

        def _hackernews_topics_generator():
            params = {
                "query": self.query,
                # search stories by default
                "tags": self.tags if self.tags else "story",
            }
            # Passing ``params`` lets requests URL-encode the query and the
            # tag expression (spaces, parentheses, commas, ...).
            response = requests.get(
                self._SEARCH_URL, params=params, timeout=self._REQUEST_TIMEOUT
            )
            if response.status_code != 200:
                raise Exception("Could not reach website.")
            dict_result = json.loads(response.content)
            # The matches live under the "hits" key of the response object;
            # iterating the object itself would only yield its key names.
            for row in dict_result.get("hits", []):
                yield {
                    # A hit may not carry every column, so fall back to None
                    # instead of failing the whole scan on a missing key.
                    property_name: row.get(property_name)
                    for property_name, _ in HACKERNEWS_COLUMNS
                }

        mapping = {
            "search_results": {
                "columns": HACKERNEWS_COLUMNS,
                "generator": _hackernews_topics_generator(),
            },
        }
        return mapping

    def connect(self):
        """
        Set up the connection required by the handler.
        Nothing is opened here; requests are issued only when data is read.
        Returns:
            DBHandlerStatus
        """
        return DBHandlerStatus(status=True)

    def disconnect(self):
        """
        Close any existing connections.
        No connection is kept open, so there is nothing to release.
        """
        pass

    def check_connection(self) -> DBHandlerStatus:
        """
        Check connection to the handler.
        Returns:
            DBHandlerStatus
        """
        if self.connection():
            return DBHandlerStatus(status=True)
        else:
            return DBHandlerStatus(status=False, error="Not connected to the internet.")

    def get_tables(self) -> DBHandlerResponse:
        """
        Return the list of tables in the database.
        Returns:
            DBHandlerResponse
        """
        if not self.connection():
            return DBHandlerResponse(data=None, error="Not connected to the internet.")

        try:
            tables_df = pd.DataFrame(
                list(self.supported_table.keys()), columns=["table_name"]
            )
            return DBHandlerResponse(data=tables_df)
        except Exception as e:
            return DBHandlerResponse(data=None, error=str(e))

    def get_columns(self, table_name: str) -> DBHandlerResponse:
        """
        Returns the list of columns for the given table.
        Args:
            table_name (str): name of the table whose columns are to be retrieved.
        Returns:
            DBHandlerResponse
        """
        if not self.connection():
            return DBHandlerResponse(data=None, error="Not connected to the internet.")
        try:
            columns_df = pd.DataFrame(
                self.supported_table[table_name]["columns"], columns=["name", "dtype"]
            )
            return DBHandlerResponse(data=columns_df)
        except Exception as e:
            return DBHandlerResponse(data=None, error=str(e))

    def select(self, table_name: str) -> DBHandlerResponse:
        """
        Returns a generator that yields the data from the given table.
        Args:
            table_name (str): name of the table whose data is to be retrieved.
        Returns:
            DBHandlerResponse
        """
        # Call the probe: the bare attribute is a function object, which is
        # always truthy, so testing it without calling never reports a failure.
        if not self.connection():
            return DBHandlerResponse(data=None, error="Not connected to the database.")
        try:
            tables = self.supported_table
            if table_name not in tables:
                return DBHandlerResponse(
                    data=None,
                    error="{} is not supported or does not exist.".format(table_name),
                )

            return DBHandlerResponse(
                data=None,
                data_generator=tables[table_name]["generator"],
            )
        except Exception as e:
            return DBHandlerResponse(data=None, error=str(e))
# Columns of the ``search_results`` table. Every entry is a two-item list:
# the column name followed by the Python type of its values.
HACKERNEWS_COLUMNS = [
    [column_name, column_type]
    for column_name, column_type in (
        ("title", str),
        ("url", str),
        ("author", str),
        ("points", int),
        ("story_text", str),
        ("num_comments", int),
    )
]
@pytest.mark.notparallel
class HackernewsDataSourceTest(unittest.TestCase):
    """Integration test for querying the ``hackernews`` database engine."""

    def setUp(self):
        self.evadb = get_evadb_for_testing()
        # reset the catalog manager before running each test
        self.evadb.catalog().reset()

    def tearDown(self):
        # Drop the database so that consecutive runs start from a clean state.
        execute_query_fetch_all(self.evadb, "DROP DATABASE IF EXISTS hackernews_data;")

    @pytest.mark.xfail(reason="Flaky testcase due to `bad request` error message")
    def test_should_run_select_query_in_hackernews(self):
        # Create database.
        params = {
            "query": "EVADB",
            "tags": "story",
        }
        query = f"""CREATE DATABASE hackernews_data
                    WITH ENGINE = "hackernews",
                    PARAMETERS = {params};"""
        execute_query_fetch_all(self.evadb, query)

        query = "SELECT * FROM hackernews_data.search_results LIMIT 5;"
        batch = execute_query_fetch_all(self.evadb, query)
        # The expected row count has to match the LIMIT of the query above.
        self.assertEqual(len(batch), 5)
        expected_column = [
            "search_results.{}".format(col) for col, _ in HACKERNEWS_COLUMNS
        ]
        self.assertEqual(batch.columns, expected_column)


if __name__ == "__main__":
    unittest.main()