From 8b6f966eb08911da5d66441beaa5301f28a31476 Mon Sep 17 00:00:00 2001
From: kseen
Date: Thu, 8 Aug 2024 02:26:43 +0300
Subject: [PATCH] Add Incremental weaving

---
 README.md                                 |  2 +-
 hashes/README.md.hash                     |  1 +
 hashes/chatgpt.com.csv.hash               |  1 +
 hashes/db.csv.hash                        |  1 +
 hashes/discord.csv.hash                   |  1 +
 hashes/facebook.com.csv.hash              |  1 +
 hashes/instagram.com.csv.hash             |  1 +
 hashes/meta.csv.hash                      |  1 +
 hashes/microsoft.csv.hash                 |  1 +
 hashes/netflix.csv.hash                   |  1 +
 hashes/x.com.csv.hash                     |  1 +
 hashes/youtube.com.csv.hash               |  1 +
 in/known/{netflix.com.csv => netflix.csv} |  0
 src/common.py                             | 65 ++++++++++++++++++++++-
 src/make_amnezia.py                       |  2 +-
 src/make_route_bat.py                     |  1 +
 src/make_wireguard.py                     |  1 +
 src/sort_db.py                            | 54 +++++++++++++------
 src/sort_readme.py                        | 10 ++++
 19 files changed, 128 insertions(+), 18 deletions(-)
 create mode 100644 hashes/README.md.hash
 create mode 100644 hashes/chatgpt.com.csv.hash
 create mode 100644 hashes/db.csv.hash
 create mode 100644 hashes/discord.csv.hash
 create mode 100644 hashes/facebook.com.csv.hash
 create mode 100644 hashes/instagram.com.csv.hash
 create mode 100644 hashes/meta.csv.hash
 create mode 100644 hashes/microsoft.csv.hash
 create mode 100644 hashes/netflix.csv.hash
 create mode 100644 hashes/x.com.csv.hash
 create mode 100644 hashes/youtube.com.csv.hash
 rename in/known/{netflix.com.csv => netflix.csv} (100%)

diff --git a/README.md b/README.md
index 2e0edd0..97d21e2 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ NONE
 - facebook.com - from [[x]](https://rockblack.su/vpn/dopolnitelno/diapazon-ip-adresov)
 - instagram.com - from [[x]](https://rockblack.su/vpn/dopolnitelno/diapazon-ip-adresov)
 - meta - from [[x]](https://rockblack.su/vpn/dopolnitelno/diapazon-ip-adresov)
-- netflix.com - from [[x]](https://rockblack.su/vpn/dopolnitelno/diapazon-ip-adresov)
+- netflix - from [[x]](https://rockblack.su/vpn/dopolnitelno/diapazon-ip-adresov)
 - openai.com
 - x.com - from [[x]](https://rockblack.su/vpn/dopolnitelno/diapazon-ip-adresov)
 - youtube.com - from [[x]](https://github.com/touhidurrr/iplist-youtube?tab=readme-ov-file) [[x]](https://www.gstatic.com/ipranges/goog.json) [[x]](https://rockblack.su/vpn/dopolnitelno/diapazon-ip-adresov)
diff --git a/hashes/README.md.hash b/hashes/README.md.hash
new file mode 100644
index 0000000..8fee22b
--- /dev/null
+++ b/hashes/README.md.hash
@@ -0,0 +1 @@
+8za~|hBYAm6q9SC{b
\ No newline at end of file
diff --git a/hashes/chatgpt.com.csv.hash b/hashes/chatgpt.com.csv.hash
new file mode 100644
index 0000000..6d6792d
--- /dev/null
+++ b/hashes/chatgpt.com.csv.hash
@@ -0,0 +1 @@
+=`Zq&|%SsmμdBDh
\ No newline at end of file
diff --git a/hashes/db.csv.hash b/hashes/db.csv.hash
new file mode 100644
index 0000000..1b361dc
--- /dev/null
+++ b/hashes/db.csv.hash
@@ -0,0 +1 @@
+}v̵JR%
\ No newline at end of file
diff --git a/hashes/facebook.com.csv.hash b/hashes/facebook.com.csv.hash
new file mode 100644
index 0000000..050ca0c
--- /dev/null
+++ b/hashes/facebook.com.csv.hash
@@ -0,0 +1 @@
+qw?Q ClLrPF¯_*)
\ No newline at end of file
diff --git a/hashes/instagram.com.csv.hash b/hashes/instagram.com.csv.hash
new file mode 100644
index 0000000..a5bf596
--- /dev/null
+++ b/hashes/instagram.com.csv.hash
@@ -0,0 +1 @@
+5셿>(H`O`lǮwS4
\ No newline at end of file
diff --git a/hashes/meta.csv.hash b/hashes/meta.csv.hash
new file mode 100644
index 0000000..d410c36
--- /dev/null
+++ b/hashes/meta.csv.hash
@@ -0,0 +1 @@
+j"m$L a=f\a"Bc
\ No newline at end of file
diff --git a/hashes/microsoft.csv.hash b/hashes/microsoft.csv.hash
new file mode 100644
index 0000000..f30bcb3
--- /dev/null
+++ b/hashes/microsoft.csv.hash
@@ -0,0 +1 @@
+?hiIl%ѵN ŵ;{
\ No newline at end of file
diff --git a/hashes/netflix.csv.hash b/hashes/netflix.csv.hash
new file mode 100644
index 0000000..5970284
--- /dev/null
+++ b/hashes/netflix.csv.hash
@@ -0,0 +1 @@
+6IjQB D @^
\ No newline at end of file
diff --git a/hashes/x.com.csv.hash b/hashes/x.com.csv.hash
new file mode 100644
index 0000000..0c002b5
--- /dev/null
+++ b/hashes/x.com.csv.hash
@@ -0,0 +1 @@
+܋=|׽/M&T sa
\ No newline at end of file
diff --git a/hashes/youtube.com.csv.hash b/hashes/youtube.com.csv.hash
new file mode 100644
index 0000000..b09ffce
--- /dev/null
+++ b/hashes/youtube.com.csv.hash
@@ -0,0 +1 @@
+F-%Aw}bhqzᇛY|
\ No newline at end of file
diff --git a/in/known/netflix.com.csv b/in/known/netflix.csv
similarity index 100%
rename from in/known/netflix.com.csv
rename to in/known/netflix.csv
diff --git a/src/common.py b/src/common.py
index e220761..d743db1 100644
--- a/src/common.py
+++ b/src/common.py
@@ -2,6 +2,7 @@
 import json
 import colorama
 import datetime
+from hashlib import sha256
 
 DB_FILE = 'db.csv'
 
@@ -120,4 +121,66 @@ def log_error(msg):
         msg (str): Error message
     """
     if LOG_LEVEL >= LOG_LEVELS['ERROR']:
-        print(f'{colorama.Fore.RED}{datetime.datetime.now()} [ERROR] {msg}{colorama.Style.RESET_ALL}')
\ No newline at end of file
+        print(f'{colorama.Fore.RED}{datetime.datetime.now()} [ERROR] {msg}{colorama.Style.RESET_ALL}')
+
+
+def hash_file(filename):
+    """Hash a file with SHA-256
+
+    Args:
+        filename (str): Name of the file to hash
+
+    Returns:
+        bytes: SHA-256 digest of the file contents
+    """
+    with open(filename, 'rb') as f:
+        return sha256(f.read()).digest()
+
+
+def hash_str(string):
+    """Hash a string with SHA-256
+
+    Args:
+        string (str): String to hash
+
+    Returns:
+        bytes: SHA-256 digest of the string
+    """
+    return sha256(string.encode()).digest()
+
+
+def save_hash_binary(new_hash_bytes, new_hash_filename):
+    """Save a hash digest to a file
+
+    Args:
+        new_hash_bytes (bytes): Hash digest to save
+        new_hash_filename (str): Name of the file to save the hash to
+    """
+    with open(new_hash_filename, 'wb') as f:
+        f.write(new_hash_bytes)
+    log_info(f'Saved hash to {new_hash_filename}')
+
+
+def read_file_binary(filename):
+    """Read binary data from a file
+
+    Args:
+        filename (str): Name of the file to read
+
+    Returns:
+        bytes: Binary data read from the file
+    """
+    with open(filename, 'rb') as f:
+        return f.read()
+
+def check_hash_binary(new_hash_bytes, old_hash_filename):
+    """Check a hash digest against a previously saved one
+
+    Args:
+        new_hash_bytes (bytes): Freshly computed hash digest
+        old_hash_filename (str): Name of the file holding the saved hash
+
+    Returns:
+        bool: Whether the hashes match
+    """
+    return new_hash_bytes == read_file_binary(old_hash_filename)
\ No newline at end of file
diff --git a/src/make_amnezia.py b/src/make_amnezia.py
index ce215a7..fd69927 100644
--- a/src/make_amnezia.py
+++ b/src/make_amnezia.py
@@ -1,8 +1,8 @@
 from common import *
 
-
 def make_amnezia():
     log_info('make_amnezia: Starting')
+
     data = read_csv(DB_FILE)
     # use ipv4 as hostname
     data['hostname'] = data['ipv4']
diff --git a/src/make_route_bat.py b/src/make_route_bat.py
index 2337173..aa5233a 100644
--- a/src/make_route_bat.py
+++ b/src/make_route_bat.py
@@ -30,6 +30,7 @@ def make_route_bat():
     log_info('make_route_bat: Starting')
+
     data = read_csv(DB_FILE)
     # if ipv4 contains / then it is a masked ip range
     masked = data[data['ipv4'].str.contains('/')]
diff --git a/src/make_wireguard.py b/src/make_wireguard.py
index 7816ece..284b9f3 100644
--- a/src/make_wireguard.py
+++ b/src/make_wireguard.py
@@ -12,6 +12,7 @@ def make_wireguard():
     log_info('make_wireguard: Starting')
+
     data = read_csv(DB_FILE)
     file_str = ""
     # add header
diff --git a/src/sort_db.py b/src/sort_db.py
index 7937af5..a73d70a 100644
--- a/src/sort_db.py
+++ b/src/sort_db.py
@@ -34,13 +34,24 @@ def drop_duplicates(data):
 
 def sort_db():
     log_info('sort_db: Starting')
+    # save_hash_binary(hash_file(DB_FILE), './hashes/' + DB_FILE + '.hash')
+    if os.path.exists('./hashes/' + DB_FILE + '.hash'):
+        if not check_hash_binary(hash_file(DB_FILE), './hashes/' + DB_FILE + '.hash'):
+            log_warning('Database file has been modified')
+        else:
+            log_info('Database file has not been modified')
+            return
+    else:
+        log_warning('No hash file found for database')
     data = read_csv(DB_FILE)
     data = data.sort_values(by=['hostname', 'ipv4', 'comment'])
     data = data.drop_duplicates(subset=['ipv4'])
     data = drop_duplicates(data)
     write_csv(data, DB_FILE)
     log_happy('Database sorted')
+    save_hash_binary(hash_file(DB_FILE), './hashes/' + DB_FILE + '.hash')
     log_info('sort_db: Finished')
+
 
 
 def drop_duplicates_in_known(data):
@@ -52,24 +63,27 @@ def drop_duplicates_in_known(data):
 
     log_info(f"Initial CIDR data count: {len(cidr_data)}")
     log_info(f"Initial non-CIDR data count: {len(not_cidr_data)}")
+    if len(not_cidr_data) == 0:
+        log_info("No non-CIDR data found")
+    else:
+        cidr_ips = set()
+        for cidr in cidr_data.iloc[:, 0]:
+            try:
+                ip_network = ipaddress.ip_network(cidr, strict=False)
+                cidr_ips.update(str(ip) for ip in ip_network.hosts())
+            except ValueError as e:
+                log_warning(f"Invalid CIDR notation {cidr}: {e}")
+                # remove invalid CIDR
+                cidr_data = cidr_data[~(cidr_data.iloc[:, 0].astype(str) == cidr)]
+                log_info(f'Dropped {cidr} because it is invalid CIDR notation')
 
-    cidr_ips = set()
-    for cidr in cidr_data.iloc[:, 0]:
-        try:
-            ip_network = ipaddress.ip_network(cidr, strict=False)
-            cidr_ips.update(str(ip) for ip in ip_network.hosts())
-        except ValueError as e:
-            log_warning(f"Invalid CIDR notation {cidr}: {e}")
-            # remove invalid CIDR
-            cidr_data = cidr_data[~(cidr_data.iloc[:, 0].astype(str) == cidr)]
-            log_info(f'Dropped {cidr} because it is invalid CIDR notation')
+        not_cidr_data = not_cidr_data[~not_cidr_data.iloc[:, 0].astype(str).isin(cidr_ips)]
 
-    not_cidr_data = not_cidr_data[~not_cidr_data.iloc[:, 0].astype(str).isin(cidr_ips)]
+        dropped_ips = set(original_data.iloc[:, 0].astype(str)) - set(not_cidr_data.iloc[:, 0].astype(str)) - set(cidr_data.iloc[:, 0].astype(str))
+        for ip in dropped_ips:
+            log_info(f'Dropped {ip} because it is included in a CIDR range')
 
-    dropped_ips = set(original_data.iloc[:, 0].astype(str)) - set(not_cidr_data.iloc[:, 0].astype(str)) - set(cidr_data.iloc[:, 0].astype(str))
-    for ip in dropped_ips:
-        log_info(f'Dropped {ip} because it is included in a CIDR range')
 
     data = pd.concat([not_cidr_data, cidr_data], ignore_index=True)
     if len(original_data) != len(data):
@@ -86,9 +100,18 @@ def sort_known():
     fname = 'fake_name'
     for file in onlyfiles:
         log_info(f"Processing file: {file}")
+        if os.path.exists(f'./hashes/{file}.hash'):
+            if not check_hash_binary(hash_file(f'in/known/{file}'), f'./hashes/{file}.hash'):
+                log_warning(f'{file} has been modified')
+            else:
+                log_info(f'{file} has not been modified')
+                continue
+        else:
+            log_warning(f'No hash file found for {file}')
+
         data = read_txt_lbl(f'in/known/{file}')
         # add first line with column name
-        data = [fname] + data
+        # data = [fname] + data
 
         # print(data)
         # exit(0)
@@ -101,6 +124,7 @@
         # drop all line with fname
         data = data[data[data.columns[0]] != fname]
         write_txt(data.iloc[:, 0].tolist(), f'in/known/{file}')
+        save_hash_binary(hash_file(f'in/known/{file}'), f'./hashes/{file}.hash')
         log_happy(f'{file} sorted')
 
     log_info("sort_known: Finished")
diff --git a/src/sort_readme.py b/src/sort_readme.py
index 4fc1381..2a3ed1d 100644
--- a/src/sort_readme.py
+++ b/src/sort_readme.py
@@ -1,11 +1,20 @@
 from common import *
 import re
+import os
 
 
 # catch <!-- ... --> and <!-- ... --> in README.md
 # sort the lines between them alphabetically
 def sort_readme():
     log_info('sort_readme: Starting')
+    if os.path.exists('./hashes/README.md.hash'):
+        if not check_hash_binary(hash_file('README.md'), './hashes/README.md.hash'):
+            log_warning('README.md has been modified')
+        else:
+            log_info('README.md has not been modified')
+            return
+    else:
+        log_warning('No hash file found for README.md')
 
     with open('README.md', 'r') as f:
         readme = f.readlines()
@@ -32,6 +41,7 @@
 
     f.writelines(sorted_readme)
     log_happy('README.md sorted')
+    save_hash_binary(hash_file('README.md'), './hashes/README.md.hash')
     log_info('sort_readme: Finished')
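
Note for reviewers: every step gated by this patch follows the same skip-if-unchanged
pattern built from the new common.py helpers — hash the input, compare it with the
digest stored under hashes/, skip the step on a match, and re-save the digest after the
file is rewritten (since sorting rewrites the file, the hash must be recomputed after
processing, not before). A minimal standalone sketch of that flow; run_if_changed,
sha256_of, and process are hypothetical names for illustration, not part of the patch:

    import os
    from hashlib import sha256

    def sha256_of(path):
        # Digest of the file's current contents
        with open(path, 'rb') as f:
            return sha256(f.read()).digest()

    def run_if_changed(path, hash_path, process):
        # Skip the work when the stored digest still matches the file
        if os.path.exists(hash_path):
            with open(hash_path, 'rb') as f:
                if f.read() == sha256_of(path):
                    return False  # unchanged since the last run
        process(path)
        # Re-hash after processing, because process() may rewrite the file
        with open(hash_path, 'wb') as f:
            f.write(sha256_of(path))
        return True

    # Usage sketch, mirroring sort_readme():
    # run_if_changed('README.md', './hashes/README.md.hash', my_sort_function)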