Skip to content
This repository has been archived by the owner on Apr 6, 2023. It is now read-only.

Commit

Permalink
Refactor & enhance
Browse files Browse the repository at this point in the history
Restructure code into appropriate modules; switch to rom models; integrate semantic_similarity

Signed-off-by: denim2x <denim2x@cyberdude.com>
  • Loading branch information
denim2x committed May 7, 2019
1 parent 711ca3d commit e709730
Show file tree
Hide file tree
Showing 19 changed files with 638 additions and 345 deletions.
1 change: 1 addition & 0 deletions .gcloudignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ package*.json
/README*
/LICENSE
/account.json
.idea
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ package-lock.json
/~*
/logs
/account.json
.idea
13 changes: 9 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,22 +28,27 @@ An engaging virtual assistant service for answering (almost) any question about

## Setup
### Requirements
- (optional) *\<project root>/account.json* with valid *GCP service account* data
- *Dialogflow* agent restored from `knowledge-agent.zip` (see *releases*):
- the *Fandom* knowledge base ('https://{0}.fandom.com/wiki/{1}') - *enabled*, identified by `Fandom KB ID` (the part after '.../editKnowledgeBase/');
- *\<project root>/config.yaml* with the following:
```yaml
google_api:
key: <API key> # for Custom Search

custom_search:
cx: <Custom Search ID>


dialogflow:
fandom: <Fandom KB ID>

redis:
- host: <host>
port: <port>
pass: <password>
pass: <password> # optional
- ...
```
- (optional) *\<project root>/account.json* with valid *GCP service account* data.
The Redis credentials are tried sequentially until a successful database connection is established.
The Redis credentials are tried sequentially until the first successful database connection.
## MIT License
6 changes: 6 additions & 0 deletions _bareasgi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
import bareasgi as _bareasgi
from bareasgi import *

def json_response(data, status=200, headers=None):
    """Build a bareasgi JSON response using a (data, status, headers) argument order.

    Args:
        data: the object to serialise as the JSON body.
        status: HTTP status code; defaults to 200.
        headers: optional mapping or iterable of ``(name, value)`` pairs.
            ``str`` names and values are encoded to bytes and names are
            lower-cased, the form ASGI expects for response headers.

    Returns:
        Whatever ``bareasgi.json_response(status, headers, data)`` returns.
    """
    def _as_bytes(value):
        # Header parts may already be bytes; only text needs encoding.
        return value if isinstance(value, bytes) else str(value).encode('latin-1')

    if headers is None:
        pairs = []
    else:
        items = headers.items() if hasattr(headers, 'items') else headers
        pairs = [(_as_bytes(key).lower(), _as_bytes(value)) for key, value in items]

    # A fresh list is passed on every call, so nothing is shared between responses.
    return _bareasgi.json_response(status, pairs, data)
6 changes: 3 additions & 3 deletions app.js
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,10 @@ function get_knowledge() {
}

function send_message(text='', cb) {
request.post('/respond').send(text).then(({ body }) => {
state.conversation.push({ text: body[0] });
request.post('/message').send(text).then(({ body }) => {
state.conversation.push(body);
}, (e) => {
_error('POST', '/respond', e);
_error('POST', '/message', e);
if (cb) {
cb();
}
Expand Down
17 changes: 17 additions & 0 deletions config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import os, yaml, json

from util import realpath


# Load config.yaml and expose each of its top-level keys as a module attribute.
with open(realpath('config.yaml')) as f:
    # safe_load is the shorthand for load(..., Loader=SafeLoader); an empty
    # file parses to None, so fall back to an empty mapping.
    _settings = yaml.safe_load(f) or {}

# Write to globals() explicitly: mutating the mapping returned by locals()
# is not a supported way to define names.
globals().update(_settings)

# Prefer the project id from the environment; otherwise read it from the
# service-account file.
project_id = os.getenv('GOOGLE_CLOUD_PROJECT')
if not project_id:
    with open(realpath('account.json')) as f:
        _account = json.load(f)
    project_id = _account['project_id']

__all__ = (
    'google_api', 'custom_search', 'dialogflow', 'redis', 'project_id'
)
63 changes: 35 additions & 28 deletions dialogflow.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import os
from uuid import uuid4

from dialogflow_v2beta1 import SessionsClient, KnowledgeBasesClient, DocumentsClient
from dialogflow_v2beta1 import types as dialogflow
from dialogflow_v2beta1 import enums
from dialogflow_v2beta1 import types, enums
from google.api_core.exceptions import InvalidArgument, GoogleAPICallError

from util import *
from util import realpath
from config import project_id

# Single-element list holding the extractive question-answering knowledge type.
EXTRACTIVE_QA = [enums.Document.KnowledgeType.EXTRACTIVE_QA]
# Location of the service-account file.
# NOTE(review): presumably util.realpath resolves it against the project root — confirm.
_account = realpath('account.json')
Expand All @@ -17,6 +18,30 @@
# Module-level Dialogflow API client instances, created once at import time.
session = SessionsClient()
kb = KnowledgeBasesClient()
docs = DocumentsClient()

class KnowledgeBase:
    """Handle on one Dialogflow knowledge base and its documents.

    Attributes:
        _path: resource name of the knowledge base.
        caption: display name of the knowledge base.
    """

    def __init__(self, id):
        """Wrap an existing knowledge base.

        Args:
            id: either a ``types.KnowledgeBase`` message, whose ``name`` and
                ``display_name`` are reused directly, or a knowledge base id
                that is combined with ``project_id`` into a resource path and
                looked up to obtain its display name.
        """
        if isinstance(id, types.KnowledgeBase):
            self._path = id.name
            self.caption = id.display_name
        else:
            self._path = kb.knowledge_base_path(project_id, str(id))
            self.caption = kb.get_knowledge_base(self._path).display_name

    def __iter__(self):
        """Yield the documents listed under this knowledge base."""
        yield from docs.list_documents(self._path)

    def create(self, caption, text=None):
        """Create a plain-text document in this knowledge base.

        Args:
            caption: display name of the document, or a ``(caption, text)``
                pair when ``text`` is omitted.
            text: the document content.

        Returns:
            The result of the create operation. If the API call fails, the
            first existing document whose display name equals ``caption``,
            or ``None`` when there is no such document.
        """
        if text is None:
            caption, text = caption
        doc = types.Document(
            display_name=caption, mime_type='text/plain',
            knowledge_types=EXTRACTIVE_QA, content=text)
        try:
            return docs.create_document(self._path, doc).result()
        except (InvalidArgument, GoogleAPICallError):
            # Best effort: fall back to a document with the same caption.
            # next() stops listing as soon as the first match is found.
            return next((d for d in self if d.display_name == caption), None)

class Dialogflow:
def __init__(self, session_id=uuid4(), language_code='en'):
Expand All @@ -26,27 +51,14 @@ def __init__(self, session_id=uuid4(), language_code='en'):
self.language_code = language_code
self.min_confidence = 0.8

def init(self, name):
    """Create a knowledge base with display name ``name``, passing ``self._kb`` as the first argument."""
    knowledge_base = dialogflow.KnowledgeBase(display_name=name)
    return kb.create_knowledge_base(self._kb, knowledge_base)

def store(self, container, title, text):
    """Create a plain-text document titled ``title`` inside ``container``.

    Returns the result of the create operation. If the API call fails,
    returns the first document in ``container`` whose display name equals
    ``title``, or ``None`` when none matches.
    """
    new_document = dialogflow.Document(
        display_name=title,
        mime_type='text/plain',
        knowledge_types=EXTRACTIVE_QA,
        content=text,
    )
    try:
        operation = docs.create_document(container, new_document)
        return operation.result()
    except (InvalidArgument, GoogleAPICallError):
        same_title = [
            existing for existing in self.documents(container)
            if existing.display_name == title
        ]
        if not same_title:
            return None
        return same_title[0]

def __call__(self, text=None, event=None):
    """Send one query to Dialogflow and return the raw detect-intent response.

    Args:
        text: text query; takes precedence over ``event`` when both are given.
        event: event name, used only when ``text`` is ``None``.

    Returns:
        The value returned by ``session.detect_intent``, or ``None`` when
        neither ``text`` nor ``event`` is supplied.
    """
    language_code = self.language_code
    # Each input is built once through ``types``; the duplicate assignments
    # through the superseded ``dialogflow`` alias were dead stores.
    if text is not None:
        text_input = types.TextInput(text=text, language_code=language_code)
        query_input = types.QueryInput(text=text_input)
    elif event is not None:
        event_input = types.EventInput(name=event, language_code=language_code)
        query_input = types.QueryInput(event=event_input)
    else:
        return None
    return session.detect_intent(session=self._session, query_input=query_input)
Expand All @@ -72,11 +84,6 @@ def event(self, name, raw=False):
res = self(event=name)
return res if raw else res.query_result.fulfillment_text

def knowledge_bases(self):
    """Return the knowledge bases listed for ``self._kb``."""
    listing = kb.list_knowledge_bases(self._kb)
    return listing

def documents(self, container):
    """List the documents of ``container``.

    ``container`` may be a resource name string or any object exposing the
    resource name as ``.name``.
    """
    resource = container if isinstance(container, str) else container.name
    return docs.list_documents(resource)
def __iter__(self):
    """Yield a ``KnowledgeBase`` wrapper for every knowledge base listed for ``self._kb``."""
    yield from (
        KnowledgeBase(entry) for entry in kb.list_knowledge_bases(self._kb)
    )
106 changes: 106 additions & 0 deletions document.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
from urllib.error import HTTPError
from collections import namedtuple

from util import pq, List


# Section identifiers that form the first part of the scrape exclusion list below.
_excludes = (
'Recommended_Readings',
'See_Also',
'Residents',
'Paraphernalia',
'Alternate_Reality_Versions'
)

# Full exclusion list, interpolated into a CSS selector when a page is scraped.
# NOTE(review): presumably util.List renders each item through `format` as
# ':not(#<item>)' and joins the results with `str` ('') — confirm against util.List.
scrape_excludes = List(
[
*_excludes,
'Links_and_References',
'References',
'Points_of_Interest',
'Links'
],
format=':not(#{item})',
str=''
)

# One scraped piece of a page: a caption plus its text.
Fragment = namedtuple('Fragment', ('caption', 'text'))

def _text(el, to_strip=None):
if el is None:
return None
return el.text().strip().strip(to_strip).strip()

class Document:
    """A wiki page loaded through ``pq`` and split into captioned fragments.

    Attributes:
        name: ``'<subdomain>|<basename>'`` identifier of the page.
        url: the page URL as a string.
        caption: the page title up to the first ``'|'``.
        site: title of the page's search ``<link>``, without a trailing
            ``'(en)'``; empty when the link is missing.
    """

    def __init__(self, url=None, name=None, quotes=False):
        """Fetch and prepare a page.

        Args:
            url: a format template filled with the ``'|'``-separated parts of
                ``name`` when ``name`` is given; otherwise an object exposing
                ``subdomain`` and ``basename`` from which the name is derived.
            name: optional ``'<subdomain>|<basename>'`` identifier.
            quotes: when true, ``.quote`` elements are scraped as well.
        """
        if name is not None:
            url = url.format(*name.split('|'))
            self.name = name
        else:
            self.name = '|'.join([url.subdomain, url.basename])

        self.url = str(url)
        try:
            doc = pq(url=self.url)
        except HTTPError:
            # Keep an empty document so the instance is simply falsy.
            doc = pq([])
        self.caption = doc.children('head > title').text().split('|', 1)[0].strip()

        # The selector needs its closing quote and bracket to be valid CSS.
        # attr() yields None when nothing matches (e.g. after an HTTPError).
        site = doc.find('link[rel="search"]').attr('title') or ''
        # Remove the literal suffix only; rstrip('(en)') strips a character
        # set and would also eat trailing 'e'/'n' from titles lacking it.
        suffix = '(en)'
        if site.endswith(suffix):
            site = site[:-len(suffix)]
        self.site = site.strip()

        self._doc = doc
        self.__content = None
        self._data = None
        self._quotes = quotes
        sel = List(['h3, p, ul, ol'])
        if self._quotes:
            sel.append('.quote')
        self._sel = str(sel)

    def __bool__(self):
        """Return True when the underlying document is non-empty."""
        return bool(self._doc)

    def _content(self):
        """Return the page's content element, cleaned once and then cached."""
        if self.__content is None:
            content = self._doc.find('.mw-content-text')
            content.find('.noprint, noscript, script, style, link, iframe, embed, video, img, .editsection').remove()
            content.find('*').remove_attr('style')
            self.__content = content
        return self.__content

    def __iter__(self):
        """Yield the page's ``Fragment`` items, scraping them on first use.

        An empty document yields nothing. The fragments are built once and
        cached in ``self._data``; every iteration, including the first,
        yields the cached list.
        """
        if not self:
            return

        if self._data is None:
            self._scrape()
        yield from self._data

    def _scrape(self):
        """Populate ``self._data`` with one fragment per h2/h3 section."""
        self._data = []
        content = self._content()
        content.find('.reference').remove()
        if self._quotes:
            for quote in content.find('.quote').items():
                author = quote.find('.selflink').closest('b')
                author.closest('dl').remove()
                _quote = quote.find('i')
                _quote.text('"' + _text(_quote, '"\'') + '"')
                author.append('said').prependTo(_quote.closest('dd'))

        h2_list = content.children(f'h2{scrape_excludes} > {scrape_excludes}').closest('h2')
        for h2 in h2_list.items():
            self._append(h2.nextUntil('h2, h3', self._sel), h2)
            # items() yields wrapped elements; iterating the selection
            # directly would yield raw elements without nextUntil()/text().
            for h3 in h2.nextUntil('h2', 'h3').items():
                self._append(h3.nextUntil('h2, h3', self._sel), h2, h3)

    def _append(self, body, *heads):
        """Store ``body`` as a fragment captioned by ``heads``.

        Returns False, storing nothing, when scraping has not started or
        ``body`` is empty; True otherwise.
        """
        _data = self._data
        if _data is None or not body:
            return False

        caption = List((_text(h) for h in heads), str='/')
        text = List((_text(e) for e in body.items()), False, str='\n')
        _data.append(Fragment(f"{self.name}#{caption}", str(text)))
        return True

    @staticmethod
    def parse_name(name):
        """Split a fragment caption into ``(page name, list of headings)``.

        Only the first ``'#'`` separates the page name from the
        ``'/'``-joined headings, so headings may themselves contain ``'#'``.
        Raises ValueError when ``name`` contains no ``'#'``.
        """
        name, heads = name.split('#', 1)
        return name, heads.split('/')
Loading

0 comments on commit e709730

Please sign in to comment.