Skip to content

Commit

Permalink
Merge branch 'release/1.2'
Browse files Browse the repository at this point in the history
  • Loading branch information
fedelemantuano committed Nov 1, 2016
2 parents be0fa5f + a81c6c1 commit 87dc3c6
Show file tree
Hide file tree
Showing 17 changed files with 372 additions and 344 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ You can chose your mails input sources (with spouts) and your functionalities (w
You can build your custom output bolts and store your data in Elasticsearch, Mongo, filesystem, etc.

### Build your topology
With streamparse technology you can build your topology in Clojure, add and/or remove spouts and bolts.
With streamparse technology you can build your topology in Python, add and/or remove spouts and bolts.

### Apache 2 Open Source License
SpamScope can be downloaded, used, and modified free of charge. It is available under the Apache 2 license.
Expand Down Expand Up @@ -90,6 +90,8 @@ It's very importart pass configuration file to commands `sparse run` and `sparse
- `sparse run --name topology -o "spamscope_conf=/etc/spamscope/spamscope.yml"`
- `sparse submit -f --name topology -o "spamscope_conf=/etc/spamscope/spamscope.yml"`

If you use the Elasticsearch output, I suggest using the Elasticsearch template that comes with SpamScope.

### Apache Storm settings

It's possible to change the default settings for all Apache Storm options. For SpamScope I suggest these options:
Expand Down
9 changes: 9 additions & 0 deletions conf/keywords/subjects.example.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,13 @@
# Add suspect subjects, each prefixed with a dash -

# keywords format:

# - word1 word2
# - word3
# - word4

# It's equal to:
# (word1 AND word2) OR word3 OR word4

- conferma
- bonifico
9 changes: 9 additions & 0 deletions conf/keywords/targets.example.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,14 @@
# For every target, insert a name and a list of keywords, each prefixed with a dash -

# keywords format:

# - word1 word2
# - word3
# - word4

# It's equal to:
# (word1 AND word2) OR word3 OR word4

Google:
- gmail
- google drive
Expand Down
7 changes: 0 additions & 7 deletions conf/spamscope.example.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,6 @@ files-mails:
# Waiting new mails, sleep seconds
waiting.sleep: 1

# Max retry failed tuple
max.retry: 3

# Post processing
post_processing:

Expand Down Expand Up @@ -199,10 +196,6 @@ output-elasticsearch:
- "node1:9200"
- "node2"

# If your application is long-running consider turning on Sniffing
# to make sure the client is up to date on the cluster location.
sniffer.timeout: 60

# Prefix with dash '-'. SpamScope use a index for day
index.prefix.mails: spamscope_mails-
doc.type.mails: analysis
Expand Down
120 changes: 120 additions & 0 deletions conf/templates/spamscope.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
{
"order": 10,
"template": "spamscope_*-*",
"settings": {
"analysis": {
"analyzer": {
"header": {
"tokenizer": "uax_url_email",
"filter": [
"lowercase"
]
},
"html_body": {
"char_filter": [
"html_strip"
],
"tokenizer": "uax_url_email",
"filter": [
"lowercase"
]
},
"path_pattern": {
"tokenizer": "path_hierarchy",
"filter": [
"lowercase"
]
}
}
},
"index.codec": "best_compression",
"index.number_of_shards": 2,
"index.number_of_replicas": 0,
"index.refresh_interval": "5s"
},
"mappings": {
"_default_": {
"dynamic_templates": [
{
"hashes": {
"mapping": {
"index": "not_analyzed",
"type": "keyword",
"eager_global_ordinals": true
},
"match_pattern": "regex",
"match": "(^|.*\\.)(md5|sha1|sha256|sha512|ssdeep)"
}
},
{
"payload": {
"mapping": {
"type": "binary"
},
"match_pattern": "regex",
"match": "(^|.*\\.)(payload)"
}
},
{
"headers": {
"mapping": {
"index": "analyzed",
"type": "text",
"analyzer": "header"
},
"match_pattern": "regex",
"match": "(from|to|headers)"
}
},
{
"body": {
"mapping": {
"index": "analyzed",
"type": "text",
"analyzer": "html_body"
},
"match": "body"
}
},
{
"path_mail": {
"mapping": {
"index": "analyzed",
"type": "text",
"analyzer": "path_pattern"
},
"match": "path_mail"
}
},
{
"all_not_analyzed": {
"mapping": {
"index": "not_analyzed",
"type": "keyword",
"eager_global_ordinals": true
},
"match_pattern": "regex",
"match": "(^|.*\\.)(charset|mail_server|mailbox|message_id|Content-Type|content_transfer_encoding|mail_content_type)"
}
},
{
"all_text": {
"mapping": {
"index": "analyzed",
"type": "text",
"fields": {
"raw": {
"ignore_above": 256,
"index": "not_analyzed",
"type": "keyword",
"eager_global_ordinals": true
}
}
},
"match_mapping_type": "text"
}
}
]
}
}
}
6 changes: 3 additions & 3 deletions src/bolts/abstracts.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ class AbstractUrlsHandlerBolt(AbstractBolt):

def initialize(self, stormconf, context):
super(AbstractUrlsHandlerBolt, self).initialize(stormconf, context)
self.extractor = UrlsExtractor()
self._extractor = UrlsExtractor()
self._load_whitelist()

def _load_whitelist(self):
Expand Down Expand Up @@ -105,8 +105,8 @@ def _extract_urls(self, text, conv_to_str=True):
urls = dict()

if text:
self.extractor.extract(text)
urls = self.extractor.urls_obj
self._extractor.extract(text)
urls = self._extractor.urls_obj
domains = urls.keys()

for d in domains:
Expand Down
6 changes: 3 additions & 3 deletions src/bolts/attachments.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,12 @@ def _load_settings(self):
virustotal_api_key=self.conf["virustotal"]["api_key"],
tika_jar=self.conf["tika"]["path_jar"],
tika_memory_allocation=self.conf["tika"]["memory_allocation"],
tika_valid_content_types=self.tika_valid_content_types)
tika_valid_content_types=self._tika_valid_content_types)

def _load_lists(self):

# Load content types for details
self.tika_valid_content_types = set()
self._tika_valid_content_types = set()
if self.conf["tika"]["enabled"]:
self.log("Reloading content types list for Tika details")
for k, v in self.conf["tika"]["valid_content_types"].iteritems():
Expand All @@ -55,7 +55,7 @@ def _load_lists(self):
"Keywords content types \
details list '{}' not valid".format(k))
keywords = [i.lower() for i in keywords]
self.tika_valid_content_types |= set(keywords)
self._tika_valid_content_types |= set(keywords)
self.log("Content types Tika '{}' loaded".format(k))

# Load content types for blacklist
Expand Down
23 changes: 8 additions & 15 deletions src/bolts/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,18 +28,11 @@ def process(self, tup):
is_filtered = tup.values[2]
with_form = False

try:
if not is_filtered and body.strip():
tree = html.fromstring(body)
results = tree.xpath('//form')
if results:
with_form = True
self.log("Forms for mail '{}'".format(sha256_random))

except Exception as e:
self.log("Failed parsing body part for mail '{}".format(
sha256_random), "error")
self.raise_exception(e, tup)

finally:
self.emit([sha256_random, with_form])
if not is_filtered and body.strip():
tree = html.fromstring(body)
results = tree.xpath('//form')
if results:
with_form = True
self.log("Forms for mail '{}'".format(sha256_random))

self.emit([sha256_random, with_form])
44 changes: 19 additions & 25 deletions src/bolts/json_maker.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class JsonMaker(Bolt):
outputs = ['sha256_random', 'json']

def initialize(self, stormconf, context):
self.mails = {}
self._mails = {}
self.input_bolts = set(context['source->stream->grouping'].keys())

# Phishing bitmap
Expand Down Expand Up @@ -87,27 +87,21 @@ def _compose_output(self, greedy_data):
return mail

def process(self, tup):
try:
bolt = tup.component
sha256_random = tup.values[0]
values = tup.values

if self.mails.get(sha256_random, None):
self.mails[sha256_random][bolt] = values
else:
self.mails[sha256_random] = {bolt: values}

diff = self.input_bolts - set(self.mails[sha256_random].keys())
if not diff:
output_json = self._compose_output(
self.mails.pop(sha256_random))

self.log("New JSON for mail '{}'".format(
sha256_random), "debug")

self.emit([sha256_random, output_json])

except Exception as e:
self.log("Failed process json for mail: {}".format(
sha256_random), "error")
self.raise_exception(e, tup)
bolt = tup.component
sha256_random = tup.values[0]
values = tup.values

if self._mails.get(sha256_random, None):
self._mails[sha256_random][bolt] = values
else:
self._mails[sha256_random] = {bolt: values}

diff = self.input_bolts - set(self._mails[sha256_random].keys())
if not diff:
output_json = self._compose_output(
self._mails.pop(sha256_random))

self.log("New JSON for mail '{}'".format(
sha256_random), "debug")

self.emit([sha256_random, output_json])
28 changes: 11 additions & 17 deletions src/bolts/output_debug.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,20 +38,14 @@ def initialize(self, stormconf, context):
os.makedirs(self._output_path)

def process(self, tup):
try:
sha256_random = tup.values[0]
mail = json.dumps(
tup.values[1],
ensure_ascii=False,
indent=self._json_indent)

output = os.path.join(self._output_path, "{}.json".format(
sha256_random))

with open(output, "w") as f:
f.write(mail.encode('utf-8'))

except Exception as e:
self.log("Failed process json for mail: {}".format(
sha256_random), "error")
self.raise_exception(e, tup)
sha256_random = tup.values[0]
mail = json.dumps(
tup.values[1],
ensure_ascii=False,
indent=self._json_indent)

output = os.path.join(self._output_path, "{}.json".format(
sha256_random))

with open(output, "w") as f:
f.write(mail.encode('utf-8'))
Loading

0 comments on commit 87dc3c6

Please sign in to comment.