diff --git a/config.py b/config.py
index fe926b2..f61d410 100644
--- a/config.py
+++ b/config.py
@@ -842,4 +842,10 @@
 # thesis year range (befoer and after)
 ORACLE_SERVICE_THESIS_YEAR_DELTA = 5
 # other doctype (ie, erratum, bookreview, etc) year range
-ORACLE_SERVICE_GENERAL_YEAR_DELTA = 2
\ No newline at end of file
+ORACLE_SERVICE_GENERAL_YEAR_DELTA = 2
+
+
+# number of records that can be inserted/updated in one call
+ORACLE_MAX_RECORDS_ADD = 100
+# number of records that can be deleted in one call
+ORACLE_MAX_RECORDS_DEL = 100
diff --git a/oraclesrv/doc_matching.py b/oraclesrv/doc_matching.py
index b6f586b..9605977 100644
--- a/oraclesrv/doc_matching.py
+++ b/oraclesrv/doc_matching.py
@@ -41,6 +41,7 @@ def __init__(self, payload, save=True):
         self.match_doctype = get_requests_params(payload, 'match_doctype', default_type=list)
         self.source_bibcode = get_requests_params(payload, 'bibcode')
         self.save_to_db = save
+        self.extra_filter = get_requests_params(payload, 'extra_filter')
 
     def create_and_return_response(self, match, query, comment=None):
         """
diff --git a/oraclesrv/tests/unittests/test_oracle_service.py b/oraclesrv/tests/unittests/test_oracle_service.py
index 3029525..062e805 100644
--- a/oraclesrv/tests/unittests/test_oracle_service.py
+++ b/oraclesrv/tests/unittests/test_oracle_service.py
@@ -11,6 +11,7 @@ from requests.exceptions import HTTPError
 from requests.models import Response
 
 
+import oraclesrv.app as app
 from oraclesrv.tests.unittests.base import TestCaseDatabase
 from oraclesrv.views import get_user_info_from_adsws, cleanup, list_tmps, list_multis
 from oraclesrv.score import clean_metadata, get_matches, to_unicode
@@ -635,25 +636,23 @@ def test_docmatch_endpoint_no_abstract_source(self, mock_query_eprint_bibstem):
                                             'confidence': 0.8989977, 'matched': 1,
                                             'scores': {'abstract': None, 'title': 1.0, 'author': 1, 'year': 1}}])
 
-    # def test_query_endpoint(self):
-    #     """
-    #     Test query endpoint with and without params passing in
-    #     :return:
-    #     """
-    #     r = self.client.post(path='/query')
-    #     result = json.loads(r.data)
-    #     self.assertDictEqual(result, {'params': {'rows': 2000, 'start': 0, 'date_cutoff': '1972-01-01 00:00:00+00:00'}, 'results': [['2018arXiv180310259Z', '2017PhDT........67Z', 0.8730186], ['2017arXiv171011147R', '2018Natur.556..473R', 0.8745491], ['2019arXiv190500882A', '2019JHEP...06..121A', 0.8960806], ['2019arXiv190804722C', '2019JHEP...10..244S', 0.8865355], ['2020arXiv200210896G', '2020A&A...635A.193G', 0.8988905], ['2019arXiv190802041G', '2020Icar..33613407G', 0.8766192], ['2022arXiv221016332G', '2023A&A...669A...7G', 0.9859276]]})
-    #
-    #     # set the rows to a larger number and see that it is reset
-    #     r = self.client.post(path='/query', data=json.dumps({'rows': 3000, 'start': 0}))
-    #     result = json.loads(r.data)
-    #     self.assertDictEqual(result, {'params': {'rows': 2000, 'start': 0, 'date_cutoff': '1972-01-01 00:00:00+00:00'}, 'results': [['2018arXiv180310259Z', '2017PhDT........67Z', 0.8730186], ['2017arXiv171011147R', '2018Natur.556..473R', 0.8745491], ['2019arXiv190500882A', '2019JHEP...06..121A', 0.8960806], ['2019arXiv190804722C', '2019JHEP...10..244S', 0.8865355], ['2020arXiv200210896G', '2020A&A...635A.193G', 0.8988905], ['2019arXiv190802041G', '2020Icar..33613407G', 0.8766192], ['2022arXiv221016332G', '2023A&A...669A...7G', 0.9859276]]})
+    def test_query_endpoint(self):
+        """
+        Test query endpoint with and without params passing in
+        """
+        r = self.client.post(path='/query')
+        result = json.loads(r.data)
+        self.assertDictEqual(result, {'params': {'rows': 2000, 'start': 0, 'date_cutoff': '1972-01-01 00:00:00+00:00'}, 'results': [['2018arXiv180310259Z', '2017PhDT........67Z', 0.8730186], ['2017arXiv171011147R', '2018Natur.556..473R', 0.8745491], ['2019arXiv190500882A', '2019JHEP...06..121A', 0.8960806], ['2019arXiv190804722C', '2019JHEP...10..244S', 0.8865355], ['2020arXiv200210896G', '2020A&A...635A.193G', 0.8988905], ['2019arXiv190802041G', '2020Icar..33613407G', 0.8766192], ['2022arXiv221016332G', '2023A&A...669A...7G', 0.9859276]]})
+
+        # set the rows to a larger number and see that it is reset
+        r = self.client.post(path='/query', data=json.dumps({'rows': 3000, 'start': 0}))
+        result = json.loads(r.data)
+        self.assertDictEqual(result, {'params': {'rows': 2000, 'start': 0, 'date_cutoff': '1972-01-01 00:00:00+00:00'}, 'results': [['2018arXiv180310259Z', '2017PhDT........67Z', 0.8730186], ['2017arXiv171011147R', '2018Natur.556..473R', 0.8745491], ['2019arXiv190500882A', '2019JHEP...06..121A', 0.8960806], ['2019arXiv190804722C', '2019JHEP...10..244S', 0.8865355], ['2020arXiv200210896G', '2020A&A...635A.193G', 0.8988905], ['2019arXiv190802041G', '2020Icar..33613407G', 0.8766192], ['2022arXiv221016332G', '2023A&A...669A...7G', 0.9859276]]})
 
 
     @mock.patch('oraclesrv.utils.query_eprint_bibstem')
     def test_get_matches(self, mock_query_eprint_bibstem):
         """
-        :return:
         """
         # mock the eprint_bibstem patterns
         mock_query_eprint_bibstem.return_value = (
@@ -721,7 +720,6 @@ def test_get_matches(self, mock_query_eprint_bibstem):
     def test_get_match_for_pub_with_doi(self, mock_query_eprint_bibstem):
         """
         Test matching publication with doi
-        :return:
         """
         # mock the eprint_bibstem patterns
         mock_query_eprint_bibstem.return_value = (
@@ -916,7 +914,6 @@ def test_list_tmps_get(self):
     def test_list_multis_get(self):
         """
         Test list_multis endpoint
-        :return:
         """
         return_value = [
             ["1995hep.ph....2279S", "1995PThPS.120...57S", 1.1],
@@ -932,7 +929,6 @@ def test_list_multis_get(self):
 
     def test_get_solr_data_chunk(self):
         """
-        :return:
         """
 
         def create_response(text):
@@ -1048,6 +1044,201 @@ def create_response(text):
         self.assertEqual(len(bibcodes), len(expected))
         self.assertEqual(docs, expected)
 
+    def test_query_doi(self):
+        """
+        Test the query_doi function of DocMatching when solr returns no results or no matches are found.
+        """
+        payload = {
+            'doi': ['10.1234/mock.doi'],
+            'doctype': 'eprint',
+            'match_doctype': ['article']
+        }
+        comment = 'some DOI query'
+        doc_match = DocMatching(payload)
+
+        with mock.patch('oraclesrv.doc_matching.get_solr_data_match_doi') as mock_get_solr_data_match_doi:
+            with mock.patch.object(self.current_app.logger, 'debug') as mock_debug:
+
+                # when solr returns no results
+                mock_get_solr_data_match_doi.return_value = ([], 'mock_query', 200)
+                result, updated_comment = doc_match.query_doi(comment)
+
+                self.assertIsNone(result)
+                self.assertIn('No result from solr with DOI', updated_comment)
+                mock_debug.assert_any_call('No result from solr with DOI %s.' % payload['doi'])
+
+                # when solr returns results, but no matches are found
+                mock_get_solr_data_match_doi.return_value = ([{'bibcode': '2000Bibcode.......A'}], 'mock_query', 200)
+                with mock.patch('oraclesrv.doc_matching.get_doi_match') as mock_get_doi_match:
+                    mock_get_doi_match.return_value = None
+
+                    result, updated_comment = doc_match.query_doi(comment)
+
+                    self.assertIsNone(result)
+                    self.assertIn('No matches with DOI', updated_comment)
+                    mock_debug.assert_any_call('No matches with DOI %s, trying Abstract.' % payload['doi'])
+
+    def test_query_pubnote(self):
+        """
+        Test the query_pubnote function of DocMatching when solr returns no results or no matches are found.
+        """
+        payload = {
+            'doi': ['10.1234/mock.doi'],
+            'doctype': 'article',
+            'match_doctype': ['eprint']
+        }
+        comment = 'some pubnote query'
+        doc_match = DocMatching(payload)
+
+        with mock.patch('oraclesrv.doc_matching.get_solr_data_match_pubnote') as mock_get_solr_data_match_pubnote:
+            with mock.patch.object(self.current_app.logger, 'debug') as mock_debug:
+
+                # when solr call to return no results
+                mock_get_solr_data_match_pubnote.return_value = ([], 'mock_query', 200)
+
+                # Test the first `else` block (no results from Solr)
+                result, updated_comment = doc_match.query_pubnote(comment)
+                self.assertIsNone(result)
+                self.assertIn('No result from solr with DOI', updated_comment)
+                mock_debug.assert_any_call('No result from solr with DOI %s in pubnote.' % payload['doi'])
+
+                with mock.patch('oraclesrv.doc_matching.get_doi_match') as mock_get_doi_match:
+
+                    # when solr call to return results but no match is found
+                    mock_get_solr_data_match_pubnote.return_value = ([{'mock': 'data'}], 'mock_query', 200)
+                    mock_get_doi_match.return_value = None
+
+                    # Test the second `else` block (no matches found)
+                    result, updated_comment = doc_match.query_pubnote(comment)
+                    self.assertIsNone(result)
+                    self.assertIn('No matches with DOI', updated_comment)
+                    mock_debug.assert_any_call('No matches with DOI %s in pubnote, trying Abstract.' % payload['doi'])
+
+
+    def test_query_abstract_or_title_retries_with_title(self):
+        """
+        Test query_abstract_or_title of DocMatching when no matches are found with abstract, and it retries with title.
+        """
+        payload = {
+            'abstract': 'Mock abstract text.',
+            'title': 'Mock title text.',
+            'doctype': 'article',
+            'match_doctype': ['eprint'],
+        }
+        comment = 'some abstract/title query'
+        doc_match = DocMatching(payload)
+
+        with mock.patch('oraclesrv.doc_matching.get_solr_data_match') as mock_get_solr_data_match, \
+             mock.patch('oraclesrv.doc_matching.get_matches', return_value=[]), \
+             mock.patch.object(self.current_app.logger, 'debug') as mock_debug:
+
+            mock_get_solr_data_match.side_effect = [
+                ([{'bibcode': '2000Bibcode.......A'}], 'mock_query_with_abstract', 200), # some results with abstract, but no match
+                ([], 'mock_query_with_abstract', 400) # no results with title
+            ]
+
+            result = doc_match.query_abstract_or_title(comment)
+            mock_debug.assert_any_call('No matches with Abstract, trying Title.')
+
+    def test_query_doctype(self):
+        """
+        Test query_doctype function of DocMatching when no matches are found from Solr results
+        """
+        payload = {
+            'author': 'Mock Author',
+            'year': 2021,
+            'doctype': 'article',
+            'match_doctype': ['eprint']
+        }
+        comment = 'some doctype query'
+        doc_match = DocMatching(payload)
+
+        with mock.patch('oraclesrv.doc_matching.get_solr_data_match_doctype_case') as mock_get_solr_data_match_doctype_case, \
+             mock.patch('oraclesrv.doc_matching.get_matches', return_value=None), \
+             mock.patch.object(self.current_app.logger, 'debug') as mock_debug, \
+             mock.patch.object(doc_match, 'create_and_return_response') as mock_create_response:
+
+            mock_get_solr_data_match_doctype_case.return_value = ([{'bibcode': '2021MockBibcode.......A'}], 'mock_query_with_doctype', 200)
+
+            result = doc_match.query_doctype(comment)
+
+            mock_debug.assert_any_call('No result from solr for eprint.')
+            mock_get_solr_data_match_doctype_case.assert_called_once_with(payload['author'], payload['year'], payload['doctype'], '"%s"' % '" OR "'.join(payload['match_doctype']))
+            mock_create_response.assert_called_once_with(match=None, query='mock_query_with_doctype', comment='some doctype query No result from solr for eprint.')
+
+    def test_query_abstract_or_title(self):
+        """
+        Test query_abstract_or_title function of DocMatching for several scenarios
+        """
+        payload = {
+            'abstract': 'Mock abstract text.',
+            'title': 'Mock title text.',
+            'doctype': 'article',
+            'match_doctype': ['eprint'],
+            'extra_filter': ''
+        }
+        comment = 'some query'
+        doc_match = DocMatching(payload)
+
+        with mock.patch('oraclesrv.doc_matching.get_solr_data_match') as mock_get_solr_data_match, \
+             mock.patch('oraclesrv.doc_matching.get_db_match') as mock_get_db_match, \
+             mock.patch('oraclesrv.doc_matching.get_matches') as mock_get_matches, \
+             mock.patch.object(self.current_app.logger, 'debug') as mock_debug, \
+             mock.patch.object(doc_match, 'create_and_return_response') as mock_create_response:
+
+            # when solr return error with abstract query
+            mock_get_solr_data_match.return_value = ([], 'mock_query_with_abstract', 400)
+            result = doc_match.query_abstract_or_title(comment)
+            mock_create_response.assert_any_call([], 'mock_query_with_abstract', 'status code: 400')
+
+            # when solr errors no records with abstract but then errors on title query
+            mock_get_solr_data_match.side_effect = [
+                ([], 'mock_query_with_abstract', 200),
+                ([], 'mock_query_with_title', 400),
+            ]
+            result = doc_match.query_abstract_or_title(comment)
+            mock_create_response.assert_any_call([], 'mock_query_with_title', 'status code: 400')
+
+            # when there are results from solr, but there are no matches, after no matches, query again with title, and error
+            mock_get_solr_data_match.side_effect = [
+                ([{'bibcode': '2000Bibcode.......A'}], 'mock_query_with_abstract', 200),
+                ([], 'mock_query_with_title', 400),
+            ]
+            mock_get_matches.return_value = []
+            result = doc_match.query_abstract_or_title(comment)
+            mock_create_response.assert_any_call([], 'mock_query_with_title', 'status code: 400')
+
+            # when abstract matches are returned from solr, but no match is found, query again with title, and no matches are found
+            # also no matches in the database
+            mock_get_solr_data_match.side_effect = [
+                ([{'bibcode': '2000Bibcode.......A'}], 'mock_query_with_abstract', 200),
+                ([], 'mock_query_with_title', 200),
+            ]
+            mock_get_matches.return_value = []
+            mock_get_db_match.return_value = []
+            result = doc_match.query_abstract_or_title(comment)
+            mock_create_response.assert_any_call(match='', query='mock_query_with_title', comment='some query No matches with Abstract, trying Title. No result from solr with Title. No matches in database either.')
+
+            # when abstract matches are returned from solr, but no match is found, query again with title, and no matches are found
+            # then query database and there is a matches in the database
+            mock_get_solr_data_match.side_effect = [
+                ([{'bibcode': '2000Bibcode.......A'}], 'mock_query_with_abstract', 200),
+                ([], 'mock_query_with_title', 200),
+            ]
+            mock_get_matches.return_value = []
+            mock_get_db_match.return_value = [{'bibcode': '2000Bibcode.......A'}]
+            result = doc_match.query_abstract_or_title(comment)
+            mock_create_response.assert_any_call([{'bibcode': '2000Bibcode.......A'}], 'mock_query_with_title', 'some query No matches with Abstract, trying Title. No result from solr with Title. Fetched from database.')
+
+            # when abstract matches are returned from solr, but no match is found, query again with title, there are matches
+            mock_get_solr_data_match.side_effect = [
+                ([{'bibcode': '2000Bibcode.......A'}], 'mock_query_with_abstract', 200),
+                ([{'bibcode': '2001Bibcode.......A'}], 'mock_query_with_title', 200),
+            ]
+            mock_get_matches.side_effect = [[], [{'bibcode': '2001Bibcode.......A'}]]
+            result = doc_match.query_abstract_or_title(comment)
+            mock_create_response.assert_any_call([{'bibcode': '2001Bibcode.......A'}], 'mock_query_with_title', 'some query No matches with Abstract, trying Title.')
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/oraclesrv/views.py b/oraclesrv/views.py
index 44ecd6e..f7d8547 100644
--- a/oraclesrv/views.py
+++ b/oraclesrv/views.py
@@ -224,6 +224,9 @@ def add():
     if len(payload) == 0:
         return return_response({'error': 'no records received to update db'}, 400)
 
+    if len(payload) > current_app.config['ORACLE_MAX_RECORDS_ADD']:
+        return return_response({'error': 'too many records to add to db at one time, received %s records while the limit is %s'%(len(payload), current_app.config['ORACLE_MAX_RECORDS_ADD'])}, 400)
+
     current_app.logger.info('received request to populate db with %d records' % (len(payload)))
 
     try:
@@ -254,6 +257,9 @@ def delete():
     if len(payload) == 0:
         return return_response({'error': 'no records received to delete from db'}, 400)
 
+    if len(payload) > current_app.config['ORACLE_MAX_RECORDS_DEL']:
+        return return_response({'error': 'too many records to delete from db at one time, received %s records while the limit is %s'%(len(payload), current_app.config['ORACLE_MAX_RECORDS_DEL'])}, 400)
+
     current_app.logger.info('received request to delete from db %d bibcodes' % (len(payload)))
 
     try: