Added sigma-similarity tool

Also fixed a bug in the backend base class that was triggered by the way this tool uses backends.
canaankao · Oct 25, 2019 · 30948b9 · 30948b9
1 parent a5ec672
commit 30948b9
Show file tree

Hide file tree

Showing 4 changed files with 173 additions and 49 deletions.
diff --git a/Pipfile b/Pipfile
@@ -12,6 +12,7 @@ elasticsearch = "*"
 elasticsearch-async = "*"
 pymisp = "*"
 PyYAML = ">=3.11"
+progressbar2 = "*"
 
 [requires]
 python_version = "3.6"
diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/tools/sigma-similarity b/tools/sigma-similarity
@@ -0,0 +1,91 @@
+#!/usr/bin/env python3
+# Calculates similarity of Sigma rules by transformation into a normalized
+# string form and calculation of a string distance.
+
+import argparse
+import pathlib
+import itertools
+import difflib
+
+import progressbar
+
+from sigma.parser.collection import SigmaCollectionParser
+from sigma.backends.base import SingleTextQueryBackend
+from sigma.configuration import SigmaConfiguration
+
+# Command line interface. The arguments are parsed right here at module level,
+# so the resulting "args" namespace is available to everything below.
+argparser = argparse.ArgumentParser(description="Calculate a similarity score between Sigma rules.")
+argparser.add_argument("--recursive", "-r", action="store_true", help="Recurse into directories")
+# default=0: without it the "count" action leaves the value at None when -v is
+# never given, and comparing None against a verbosity level raises TypeError.
+argparser.add_argument("--verbose", "-v", action="count", default=0, help="Be verbose. Use once more for debug output.")
+argparser.add_argument("--top", "-t", type=int, help="Only output the n most similar rule pairs.")
+argparser.add_argument("--min-similarity", "-m", type=int, help="Only output pairs with a similarity above this threshold (percent)")
+argparser.add_argument("inputs", nargs="+", help="Sigma input files")
+args = argparser.parse_args()
+
+def print_verbose(level, *values, **kwargs):
+    """
+    Print the given values if the verbosity requested on the command line is at least `level`.
+
+    All positional and keyword arguments after `level` are forwarded to print() unchanged.
+    """
+    # The variadic parameter must not be named "args": that would shadow the
+    # module-level argparse namespace and turn args.verbose into an attribute
+    # lookup on a tuple. "or 0" covers a count of None (no -v given).
+    if (args.verbose or 0) >= level:
+        print(*values, **kwargs)
+
+class SigmaNormalizationBackend(SingleTextQueryBackend):
+    """
+    Renders a Sigma rule as text in a made-up query language that can express every Sigma feature.
+
+    Lists and AND/OR condition items are emitted in sorted order.
+    """
+    # Tokens and templates of the normalized output language
+    andToken = " AND "
+    orToken = " OR "
+    notToken = " NOT "
+    subExpression = "(%s)"
+    listExpression = "[%s]"
+    listSeparator = ","
+    valueExpression = "%s"
+    typedValueExpression = {}
+    nullExpression = "NULL(%s)"
+    notNullExpression = "NOTNULL(%s)"
+    mapExpression = "{'%s':'%s'}"
+
+    # Ask the base class to sort the items of AND and OR conditions
+    sort_condition_lists = True
+
+    def generateListNode(self, node):
+        """Generate the list from the string forms of its items, in sorted order."""
+        items_as_text = sorted(str(item) for item in node)
+        return super().generateListNode(items_as_text)
+
+    def generateTypedValueNode(self, node):
+        """Render a typed value as type_<identifier>(<string form of the value>)."""
+        identifier = node.identifier
+        value_text = str(node)
+        return "type_{}({})".format(identifier, value_text)
+
+    def generateAggregation(self, agg):
+        """Render an aggregation; "near" gets its own form, everything else a pipe expression."""
+        if agg.aggfunc_notrans != "near":
+            return " | {}({}) by {} {} {}".format(
+                agg.aggfunc_notrans,
+                agg.aggfield,
+                agg.groupfield,
+                agg.cond_op,
+                agg.condition,
+            )
+        included = str(agg.include)
+        excluded = str(agg.exclude)
+        return " near in={} ex={}".format(included, excluded)
+
+# Backend instance used to normalize every rule
+backend = SigmaNormalizationBackend(SigmaConfiguration())
+
+# Collect the input files; with --recursive every regular file below each input is used
+if args.recursive:
+    paths = [ p for pathname in args.inputs for p in pathlib.Path(pathname).glob("**/*") if p.is_file() ]
+else:
+    paths = [ pathlib.Path(pathname) for pathname in args.inputs ]
+
+# path string -> parsed rule collection.
+# read_text() closes the file again, unlike a bare open().read().
+parsed = {
+            str(path): SigmaCollectionParser(path.read_text())
+            for path in paths
+        }
+# path string -> list of normalized strings generated from that file
+converted = {
+            path: list(sigma_collection.generate(backend))
+            for path, sigma_collection in parsed.items()
+        }
+# One (path, index within file, normalized string) tuple per generated string
+converted_flat = (
+            (path, i, normalized)
+            for path, nlist in converted.items()
+            for i, normalized in enumerate(nlist)
+        )
+# Materialized as a list so the progress bar knows the total number of pairs
+converted_pairs = list(itertools.combinations(converted_flat, 2))
+# Each entry: ((path, index), (path, index), similarity ratio of the two normalized strings)
+similarities = [
+        (item1[:2], item2[:2], difflib.SequenceMatcher(None, item1[2], item2[2]).ratio())
+            for item1, item2 in progressbar.progressbar(converted_pairs)
+        ]
+
+# Print the pairs from most to least similar, honouring --min-similarity and --top
+ranked = sorted(similarities, key=lambda s: s[2], reverse=True)
+for printed, (left, right, ratio) in enumerate(ranked, start=1):
+    # Sorted descending: once one pair is below the minimum, all remaining ones are too
+    if args.min_similarity and ratio * 100 < args.min_similarity:
+        break
+    print("{:70} | {:2} | {:70} | {:2} | {:>3.2%}".format(*left, *right, ratio))
+    # Stop once the requested number of pairs has been printed
+    if args.top and printed >= args.top:
+        break
diff --git a/tools/sigma/backends/base.py b/tools/sigma/backends/base.py
@@ -90,7 +90,7 @@ class BaseBackend:
     options = tuple()     # a list of tuples with following elements: option name, default value, help text, target attribute name (option name if None)
     config_required = True
 
-    def __init__(self, sigmaconfig, backend_options=None):
+    def __init__(self, sigmaconfig, backend_options=dict()):
         """
         Initialize backend. This gets a sigmaconfig object, which is notified about the used backend class by
         passing the object instance to it.
@@ -221,10 +221,14 @@ class SingleTextQueryBackend(RulenameCommentMixin, BaseBackend, QuoteCharMixin):
     mapListsSpecialHandling = False     # Same handling for map items with list values as for normal values (strings, integers) if True, generateMapItemListNode method is called with node
     mapListValueExpression = None       # Syntax for field/value condititons where map value is a list
 
+    sort_condition_lists = False        # Sort condition items for AND and OR conditions
+
     def generateANDNode(self, node):
+        """
+        Generate every item of node and join the results that are not None with andToken.
+
+        Returns None if no item produced a result.
+        """
         generated = [ self.generateNode(val) for val in node ]
         filtered = [ g for g in generated if g is not None ]
         if filtered:
+            # Optional sorting makes the output independent of the order of the items
             if self.sort_condition_lists:
                 filtered = sorted(filtered)
             return self.andToken.join(filtered)
         else:
             return None
@@ -233,6 +237,8 @@ def generateORNode(self, node):
         generated = [ self.generateNode(val) for val in node ]
         filtered = [ g for g in generated if g is not None ]
         if filtered:
+            if self.sort_condition_lists:
+                filtered = sorted(filtered)
             return self.orToken.join(filtered)
         else:
             return None