Refactor & enhance

Restructure code into appropriate modules; switch to rom models; integrate semantic_similarity. Signed-off-by: denim2x <denim2x@cyberdude.com>
denim2x · May 7, 2019 · e709730 · e709730
1 parent 711ca3d
commit e709730
Show file tree

Hide file tree

Showing 19 changed files with 638 additions and 345 deletions.
diff --git a/.gcloudignore b/.gcloudignore
@@ -13,3 +13,4 @@ package*.json
 /README*
 /LICENSE
 /account.json
+.idea
diff --git a/.gitignore b/.gitignore
@@ -11,3 +11,4 @@ package-lock.json
 /~*
 /logs
 /account.json
+.idea
diff --git a/README.md b/README.md
@@ -28,22 +28,27 @@ An engaging virtual assistant service for answering (almost) any question about
 
 ## Setup
 ### Requirements
-- (optional) *\<project root>/account.json* with valid *GCP service account* data
+- *Dialogflow* agent restored from `knowledge-agent.zip` (see *releases*):
+  - the *Fandom* knowledge base ('https://{0}.fandom.com/wiki/{1}') - *enabled*, identified by `Fandom KB ID` (the part after '.../editKnowledgeBase/');
 - *\<project root>/config.yaml* with the following:
 ```yaml
 google_api:
   key: <API key>   # for Custom Search
 
 custom_search:
   cx: <Custom Search ID>
-
+
+dialogflow:
+  fandom: <Fandom KB ID>
+
 redis:
   - host: <host>
     port: <port>
-    pass: <password>
+    pass: <password>  # optional
   - ...
 ```
+- (optional) *\<project root>/account.json* with valid *GCP service account* data.
 
-The Redis credentials are tried sequentially until a successful database connection is established.
+The Redis credentials are tried sequentially until the first successful database connection.
 
 ## MIT License
diff --git a/_bareasgi.py b/_bareasgi.py
@@ -0,0 +1,6 @@
+import bareasgi as _bareasgi
+from bareasgi import *
+
+def json_response(data, status=200, headers=None):
+  """Build a JSON response with a keyword-friendly argument order.
+
+  Thin wrapper around bareasgi's json_response, which is called here as
+  (status, headers, data).
+
+  data:    object to serialise as the response body.
+  status:  HTTP status code (200 by default).
+  headers: optional mapping, or iterable of (name, value) pairs; str items
+           are encoded to bytes and names are lower-cased before being
+           passed on. When omitted (or empty) an empty list is passed,
+           exactly as before.
+  """
+  if headers is None:
+    pairs = []
+  elif hasattr(headers, 'items'):
+    pairs = list(headers.items())
+  else:
+    pairs = list(headers)
+
+  def _raw(value):
+    # header items are sent as bytes; leave pre-encoded values untouched
+    return value if isinstance(value, bytes) else str(value).encode('latin-1')
+
+  headers = [(_raw(key).lower(), _raw(value)) for key, value in pairs]
+  return _bareasgi.json_response(status, headers, data)
diff --git a/app.js b/app.js
@@ -46,10 +46,10 @@ function get_knowledge() {
 }
 
 function send_message(text='', cb) {
-  request.post('/respond').send(text).then(({ body }) => {
-    state.conversation.push({ text: body[0] });
+  request.post('/message').send(text).then(({ body }) => {
+    state.conversation.push(body);
   }, (e) => { 
-    _error('POST', '/respond', e);
+    _error('POST', '/message', e);
     if (cb) {
       cb();
     }

diff --git a/config.py b/config.py
@@ -0,0 +1,17 @@
+import os, yaml, json
+
+from util import realpath
+
+
+# Load config.yaml and expose each of its top-level keys as a module attribute.
+with open(realpath('config.yaml')) as f:
+  # At module scope locals() is the module namespace, so update() defines
+  # module-level names straight from the YAML mapping.
+  locals().update(yaml.load(f, Loader=yaml.SafeLoader))
+
+# Take the project id from the GOOGLE_CLOUD_PROJECT environment variable when
+# it is set and non-empty; otherwise read it from account.json.
+project_id = os.getenv('GOOGLE_CLOUD_PROJECT', None)
+if not project_id:
+  with open(realpath('account.json')) as f:
+    _account = json.load(f)
+    project_id = _account['project_id']
+
+# Names exported by `from config import *`; all but project_id are expected
+# to come from config.yaml.
+__all__ = (
+  'google_api', 'custom_search', 'dialogflow', 'redis', 'project_id'
+)
diff --git a/dialogflow.py b/dialogflow.py
@@ -1,11 +1,12 @@
+import os
 from uuid import uuid4
 
 from dialogflow_v2beta1 import SessionsClient, KnowledgeBasesClient, DocumentsClient
-from dialogflow_v2beta1 import types as dialogflow
-from dialogflow_v2beta1 import enums
+from dialogflow_v2beta1 import types, enums
 from google.api_core.exceptions import InvalidArgument, GoogleAPICallError
 
-from util import *
+from util import realpath
+from config import project_id
 
 EXTRACTIVE_QA = [enums.Document.KnowledgeType.EXTRACTIVE_QA]
 _account = realpath('account.json')
@@ -17,6 +18,30 @@
   session = SessionsClient()
   kb = KnowledgeBasesClient()
   docs = DocumentsClient()
+
+class KnowledgeBase:
+  """Wrapper around one Dialogflow knowledge base and its documents.
+
+  Iterating an instance yields the documents returned by
+  docs.list_documents for the knowledge base path.
+  """
+
+  def __init__(self, id):
+    """Accept either a types.KnowledgeBase message or a knowledge base id.
+
+    For a message, the path and caption are copied from it; for anything
+    else the path is built from project_id and str(id), and the caption is
+    fetched through the KnowledgeBasesClient.
+    """
+    if not isinstance(id, types.KnowledgeBase):
+      self._path = kb.knowledge_base_path(project_id, str(id))
+      fetched = kb.get_knowledge_base(self._path)
+      self.caption = fetched.display_name
+      return
+    self._path = id.name
+    self.caption = id.display_name
+
+  def __iter__(self):
+    """Yield every document listed under this knowledge base."""
+    for document in docs.list_documents(self._path):
+      yield document
+
+  def create(self, caption, text=None):
+    """Create a plain-text EXTRACTIVE_QA document and return it.
+
+    May be called as create(caption, text) or create((caption, text)).
+    When creation raises InvalidArgument or GoogleAPICallError, the first
+    existing document whose display_name equals `caption` is returned
+    instead, or None when there is none.
+    """
+    if text is None:
+      # single-argument form: unpack the (caption, text) pair
+      caption, text = caption
+    payload = types.Document(
+      display_name=caption,
+      mime_type='text/plain',
+      knowledge_types=EXTRACTIVE_QA,
+      content=text)
+    try:
+      operation = docs.create_document(self._path, payload)
+      return operation.result()
+    except (InvalidArgument, GoogleAPICallError):
+      matches = [d for d in self if d.display_name == caption]
+      if not matches:
+        return None
+      return matches[0]
 
 class Dialogflow:
   def __init__(self, session_id=uuid4(), language_code='en'):
@@ -26,27 +51,14 @@ def __init__(self, session_id=uuid4(), language_code='en'):
     self.language_code = language_code
     self.min_confidence = 0.8
 
-  def init(self, name):
-    return kb.create_knowledge_base(self._kb, dialogflow.KnowledgeBase(display_name=name))
-
-  def store(self, container, title, text):
-    doc = dialogflow.Document(
-      display_name=title, mime_type='text/plain', 
-      knowledge_types=EXTRACTIVE_QA, content=text)
-    try:
-      return docs.create_document(container, doc).result()
-    except (InvalidArgument, GoogleAPICallError):
-      res = [d for d in self.documents(container) if d.display_name == title]
-      return res[0] if res else None
-
   def __call__(self, text=None, event=None):
+    """Send a text query or a named event to the Dialogflow session.
+
+    `text` takes precedence over `event`. Returns None when both are None;
+    otherwise returns the result of session.detect_intent for this
+    instance's session, using self.language_code for the input.
+    """
     language_code = self.language_code
     if text is not None:
-      text_input = dialogflow.TextInput(text=text, language_code=language_code)
-      query_input = dialogflow.QueryInput(text=text_input)
+      text_input = types.TextInput(text=text, language_code=language_code)
+      query_input = types.QueryInput(text=text_input)
     elif event is not None:
-      event_input = dialogflow.EventInput(name=event, language_code=language_code)
-      query_input = dialogflow.QueryInput(event=event_input)
+      event_input = types.EventInput(name=event, language_code=language_code)
+      query_input = types.QueryInput(event=event_input)
     else:
       return None
     return session.detect_intent(session=self._session, query_input=query_input)
@@ -72,11 +84,6 @@ def event(self, name, raw=False):
     res = self(event=name)
     return res if raw else res.query_result.fulfillment_text
 
-  def knowledge_bases(self):
-    return kb.list_knowledge_bases(self._kb)
-
-  def documents(self, container):
-    name = container
-    if not isinstance(container, str):
-      name = container.name
-    return docs.list_documents(name)
+  def __iter__(self):
+    """Yield a KnowledgeBase wrapper for each knowledge base listed under self._kb."""
+    wrappers = map(KnowledgeBase, kb.list_knowledge_bases(self._kb))
+    yield from wrappers
diff --git a/document.py b/document.py
@@ -0,0 +1,106 @@
+from urllib.error import HTTPError
+from collections import namedtuple
+
+from util import pq, List
+
+
+# Ids excluded when scraping; they are interpolated into the ':not(#...)'
+# selector fragments below, so each entry must be a valid element id.
+_excludes = (
+  'Recommended_Readings',
+  'See_Also',
+  'Residents',
+  'Paraphernalia',
+  'Alternate_Reality_Versions'
+)
+
+# Exclusion list used in the h2 selector built by Document.__iter__.
+# NOTE(review): presumably List renders each item through `format` and joins
+# the results with `str` when converted to a string - confirm against util.List.
+scrape_excludes = List(
+  [
+    *_excludes,
+    'Links_and_References',
+    'References',
+    'Points_of_Interest',
+    'Links'
+  ],
+  format=':not(#{item})',
+  str=''
+)
+
+# One scraped section: `caption` identifies it, `text` holds its body text.
+Fragment = namedtuple('Fragment', ('caption', 'text'))
+
+def _text(el, to_strip=None):
+  """Return el.text() trimmed of whitespace and of `to_strip` characters.
+
+  Whitespace is stripped first, then any leading/trailing characters from
+  `to_strip`, then whitespace again. Returns None when `el` is None.
+  """
+  if el is not None:
+    trimmed = el.text().strip()
+    trimmed = trimmed.strip(to_strip)
+    return trimmed.strip()
+  return None
+
+class Document:
+  """A fetched wiki page, iterable as heading-delimited Fragment items.
+
+  Attributes set by the constructor:
+    name    - '<subdomain>|<basename>' identifier of the page
+    url     - page address as a string
+    caption - page title (the part of <title> before the first '|')
+    site    - title of the page's search link, without a trailing '(en)'
+  """
+
+  _SITE_SUFFIX = '(en)'
+
+  def __init__(self, url=None, name=None, quotes=False):
+    """Fetch the page.
+
+    url:    URL template (when `name` is given) or a URL object exposing
+            `subdomain` and `basename`.
+    name:   optional '<subdomain>|<basename>' string used to fill `url`.
+    quotes: when true, '.quote' elements are rewritten and included.
+
+    An HTTPError during the fetch leaves the document empty (falsy).
+    """
+    if name is not None:
+      url = url.format(*name.split('|'))
+      self.name = name
+    else:
+      self.name = '|'.join([url.subdomain, url.basename])
+
+    self.url = str(url)
+    try:
+      doc = pq(url=self.url)
+    except HTTPError:
+      doc = pq([])
+    self.caption = doc.children('head > title').text().split('|', 1)[0].strip()
+    # The attribute selector must be closed; the title may be missing
+    # altogether (e.g. after a failed fetch), so fall back to ''.
+    site = (doc.find('link[rel="search"]').attr('title') or '').strip()
+    # Remove the literal suffix only; rstrip('(en)') would also eat trailing
+    # 'e'/'n' characters of titles that do not end in '(en)'.
+    if site.endswith(self._SITE_SUFFIX):
+      site = site[:-len(self._SITE_SUFFIX)].strip()
+    self.site = site
+    self._doc = doc
+    self.__content = None
+    self._data = None
+    self._quotes = quotes
+    sel = List(['h3, p, ul, ol'])
+    if self._quotes:
+      sel.append('.quote')
+    self._sel = str(sel)
+
+  def __bool__(self):
+    """True when the fetched page is non-empty."""
+    return bool(self._doc)
+
+  def _content(self):
+    """Return the cleaned content element, computing it on first use."""
+    if self.__content is None:
+      content = self._doc.find('.mw-content-text')
+      content.find('.noprint, noscript, script, style, link, iframe, embed, video, img, .editsection').remove()
+      content.find('*').remove_attr('style')
+      self.__content = content
+    return self.__content
+
+  def __iter__(self):
+    """Yield the page's Fragment items, scraping them on first iteration.
+
+    Later iterations replay the cached fragments without re-scraping
+    (re-scraping would also rewrite the quote markup a second time).
+    """
+    if not self:
+      return
+
+    if self._data is None:
+      self._scrape()
+    yield from self._data
+
+  def _scrape(self):
+    """Fill self._data with one Fragment per non-excluded h2/h3 section."""
+    self._data = []
+    content = self._content()
+    content.find('.reference').remove()
+    if self._quotes:
+      # Turn each quote block into an inline '<author> said "<quote>"' sentence.
+      for quote in content.find('.quote').items():
+        author = quote.find('.selflink').closest('b')
+        author.closest('dl').remove()
+        _quote = quote.find('i')
+        _quote.text('"' + _text(_quote, '"\'') + '"')
+        author.append('said').prependTo(_quote.closest('dd'))
+
+    h2_list = content.children(f'h2{scrape_excludes} > {scrape_excludes}').closest('h2')
+    for h2 in h2_list.items():
+      self._append(h2.nextUntil('h2, h3', self._sel), h2)
+      # .items() so each h3 is a query object like h2 above (it needs
+      # nextUntil()/text()), matching the other loops in this method.
+      for h3 in h2.nextUntil('h2', 'h3').items():
+        self._append(h3.nextUntil('h2, h3', self._sel), h2, h3)
+
+  def _append(self, body, *heads):
+    """Record one Fragment built from `heads` (caption) and `body` (text).
+
+    Returns False, recording nothing, when scraping has not started or
+    `body` is empty; True otherwise.
+    """
+    _data = self._data
+    if _data is None or not body:
+      return False
+
+    caption = List((_text(h) for h in heads), str='/')
+    text = List((_text(e) for e in body.items()), False, str='\n')
+    _data.append(Fragment(f"{self.name}#{caption}", str(text)))
+    return True
+
+  @staticmethod
+  def parse_name(name):
+    """Split a fragment caption '<name>#<h2>/<h3>' into (name, [heads]).
+
+    Only the first '#' separates the page name from the headings, so
+    heading text that itself contains '#' is preserved.
+    """
+    name, heads = name.split('#', 1)
+    return name, heads.split('/')
-Original file line number
+Diff line change
@@ Expand Up / @@ -11,3 +11,4 @@ package-lock.json @@
     /~*
     /logs
     /account.json
+    .idea