From 8b6f966eb08911da5d66441beaa5301f28a31476 Mon Sep 17 00:00:00 2001
From: kseen
Date: Thu, 8 Aug 2024 02:26:43 +0300
Subject: [PATCH] Add Incremental weaving

---
 README.md                                 |  2 +-
 hashes/README.md.hash                     |  1 +
 hashes/chatgpt.com.csv.hash               |  1 +
 hashes/db.csv.hash                        |  1 +
 hashes/discord.csv.hash                   |  1 +
 hashes/facebook.com.csv.hash              |  1 +
 hashes/instagram.com.csv.hash             |  1 +
 hashes/meta.csv.hash                      |  1 +
 hashes/microsoft.csv.hash                 |  1 +
 hashes/netflix.csv.hash                   |  1 +
 hashes/x.com.csv.hash                     |  1 +
 hashes/youtube.com.csv.hash               |  1 +
 in/known/{netflix.com.csv => netflix.csv} |  0
 src/common.py                             | 65 ++++++++++++++++++++++-
 src/make_amnezia.py                       |  2 +-
 src/make_route_bat.py                     |  1 +
 src/make_wireguard.py                     |  1 +
 src/sort_db.py                            | 54 +++++++++++++------
 src/sort_readme.py                        | 10 ++++
 19 files changed, 128 insertions(+), 18 deletions(-)
 create mode 100644 hashes/README.md.hash
 create mode 100644 hashes/chatgpt.com.csv.hash
 create mode 100644 hashes/db.csv.hash
 create mode 100644 hashes/discord.csv.hash
 create mode 100644 hashes/facebook.com.csv.hash
 create mode 100644 hashes/instagram.com.csv.hash
 create mode 100644 hashes/meta.csv.hash
 create mode 100644 hashes/microsoft.csv.hash
 create mode 100644 hashes/netflix.csv.hash
 create mode 100644 hashes/x.com.csv.hash
 create mode 100644 hashes/youtube.com.csv.hash
 rename in/known/{netflix.com.csv => netflix.csv} (100%)

diff --git a/README.md b/README.md
index 2e0edd0..97d21e2 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ NONE
 - facebook.com - from [[x]](https://rockblack.su/vpn/dopolnitelno/diapazon-ip-adresov)
 - instagram.com - from [[x]](https://rockblack.su/vpn/dopolnitelno/diapazon-ip-adresov)
 - meta - from [[x]](https://rockblack.su/vpn/dopolnitelno/diapazon-ip-adresov)
-- netflix.com - from [[x]](https://rockblack.su/vpn/dopolnitelno/diapazon-ip-adresov)
+- netflix - from [[x]](https://rockblack.su/vpn/dopolnitelno/diapazon-ip-adresov)
 - openai.com
 - x.com - from [[x]](https://rockblack.su/vpn/dopolnitelno/diapazon-ip-adresov)
 - youtube.com - from [[x]](https://github.com/touhidurrr/iplist-youtube?tab=readme-ov-file) [[x]](https://www.gstatic.com/ipranges/goog.json) [[x]](https://rockblack.su/vpn/dopolnitelno/diapazon-ip-adresov)
diff --git a/hashes/README.md.hash b/hashes/README.md.hash
new file mode 100644
index 0000000..8fee22b
--- /dev/null
+++ b/hashes/README.md.hash
@@ -0,0 +1 @@
+8za~|hBYAm6q9SC{b
\ No newline at end of file
diff --git a/hashes/chatgpt.com.csv.hash b/hashes/chatgpt.com.csv.hash
new file mode 100644
index 0000000..6d6792d
--- /dev/null
+++ b/hashes/chatgpt.com.csv.hash
@@ -0,0 +1 @@
+=`Zq&|%SsmμdBDh
\ No newline at end of file
diff --git a/hashes/db.csv.hash b/hashes/db.csv.hash
new file mode 100644
index 0000000..1b361dc
--- /dev/null
+++ b/hashes/db.csv.hash
@@ -0,0 +1 @@
+}v̵JR%
\ No newline at end of file
diff --git a/hashes/facebook.com.csv.hash b/hashes/facebook.com.csv.hash
new file mode 100644
index 0000000..050ca0c
--- /dev/null
+++ b/hashes/facebook.com.csv.hash
@@ -0,0 +1 @@
+qw?Q ClLrPF¯_*)
\ No newline at end of file
diff --git a/hashes/instagram.com.csv.hash b/hashes/instagram.com.csv.hash
new file mode 100644
index 0000000..a5bf596
--- /dev/null
+++ b/hashes/instagram.com.csv.hash
@@ -0,0 +1 @@
+5셿>(H`O`lǮwS4
\ No newline at end of file
diff --git a/hashes/meta.csv.hash b/hashes/meta.csv.hash
new file mode 100644
index 0000000..d410c36
--- /dev/null
+++ b/hashes/meta.csv.hash
@@ -0,0 +1 @@
+j"m$L a=f\a"Bc
\ No newline at end of file
diff --git a/hashes/microsoft.csv.hash b/hashes/microsoft.csv.hash
new file mode 100644
index 0000000..f30bcb3
--- /dev/null
+++ b/hashes/microsoft.csv.hash
@@ -0,0 +1 @@
+?hiIl%ѵN ŵ;{
\ No newline at end of file
diff --git a/hashes/netflix.csv.hash b/hashes/netflix.csv.hash
new file mode 100644
index 0000000..5970284
--- /dev/null
+++ b/hashes/netflix.csv.hash
@@ -0,0 +1 @@
+6IjQB D @^
\ No newline at end of file
diff --git a/hashes/x.com.csv.hash b/hashes/x.com.csv.hash
new file mode 100644
index 0000000..0c002b5
--- /dev/null
+++ b/hashes/x.com.csv.hash
@@ -0,0 +1 @@
+܋=|׽/M&T sa
\ No newline at end of file
diff --git a/hashes/youtube.com.csv.hash b/hashes/youtube.com.csv.hash
new file mode 100644
index 0000000..b09ffce
--- /dev/null
+++ b/hashes/youtube.com.csv.hash
@@ -0,0 +1 @@
+F-%Aw}bhqzᇛY|
\ No newline at end of file
diff --git a/in/known/netflix.com.csv b/in/known/netflix.csv
similarity index 100%
rename from in/known/netflix.com.csv
rename to in/known/netflix.csv
diff --git a/src/common.py b/src/common.py
index e220761..d743db1 100644
--- a/src/common.py
+++ b/src/common.py
@@ -2,6 +2,7 @@
 import json
 import colorama
 import datetime
+from hashlib import sha256
 
 DB_FILE = 'db.csv'
 
@@ -120,4 +121,66 @@ def log_error(msg):
         msg (str): Error message
     """
     if LOG_LEVEL >= LOG_LEVELS['ERROR']:
-        print(f'{colorama.Fore.RED}{datetime.datetime.now()} [ERROR] {msg}{colorama.Style.RESET_ALL}')
\ No newline at end of file
+        print(f'{colorama.Fore.RED}{datetime.datetime.now()} [ERROR] {msg}{colorama.Style.RESET_ALL}')
+
+
+def hash_file(filename):
+    """Hash a file with SHA-256
+
+    Args:
+        filename (str): Name of the file to hash
+
+    Returns:
+        bytes: SHA-256 digest of the file contents
+    """
+    with open(filename, 'rb') as f:
+        return sha256(f.read()).digest()
+
+
+def hash_str(string):
+    """Hash a string with SHA-256
+
+    Args:
+        string (str): String to hash
+
+    Returns:
+        bytes: SHA-256 digest of the string
+    """
+    return sha256(string.encode()).digest()
+
+
+def save_hash_binary(new_hash_bytes, new_hash_filename):
+    """Save a hash digest to a file
+
+    Args:
+        new_hash_bytes (bytes): Hash digest to save
+        new_hash_filename (str): Name of the file to save the hash to
+    """
+    with open(new_hash_filename, 'wb') as f:
+        f.write(new_hash_bytes)
+    log_info(f'Saved hash to {new_hash_filename}')
+
+
+def read_file_binary(filename):
+    """Read binary data from a file
+
+    Args:
+        filename (str): Name of the file to read
+
+    Returns:
+        bytes: Binary data read from the file
+    """
+    with open(filename, 'rb') as f:
+        return f.read()
+
+def check_hash_binary(new_hash_bytes, old_hash_filename):
+    """Check a hash digest against a previously saved one
+
+    Args:
+        new_hash_bytes (bytes): Freshly computed hash digest
+        old_hash_filename (str): Name of the file holding the saved hash
+
+    Returns:
+        bool: Whether the hashes match
+    """
+    return new_hash_bytes == read_file_binary(old_hash_filename)
\ No newline at end of file
diff --git a/src/make_amnezia.py b/src/make_amnezia.py
index ce215a7..fd69927 100644
--- a/src/make_amnezia.py
+++ b/src/make_amnezia.py
@@ -1,8 +1,8 @@
 from common import *
 
-
 def make_amnezia():
     log_info('make_amnezia: Starting')
+
     data = read_csv(DB_FILE)
     # use ipv4 as hostname
     data['hostname'] = data['ipv4']
diff --git a/src/make_route_bat.py b/src/make_route_bat.py
index 2337173..aa5233a 100644
--- a/src/make_route_bat.py
+++ b/src/make_route_bat.py
@@ -30,6 +30,7 @@ def make_route_bat():
     log_info('make_route_bat: Starting')
+
     data = read_csv(DB_FILE)
     # if ipv4 contains / then it is a masked ip range
     masked = data[data['ipv4'].str.contains('/')]
diff --git a/src/make_wireguard.py b/src/make_wireguard.py
index 7816ece..284b9f3 100644
--- a/src/make_wireguard.py
+++ b/src/make_wireguard.py
@@ -12,6 +12,7 @@ def make_wireguard():
     log_info('make_wireguard: Starting')
+
     data = read_csv(DB_FILE)
     file_str = ""
     # add header
diff --git a/src/sort_db.py b/src/sort_db.py
index 7937af5..a73d70a 100644
--- a/src/sort_db.py
+++ b/src/sort_db.py
@@ -34,13 +34,24 @@ def drop_duplicates(data):
 
 def sort_db():
     log_info('sort_db: Starting')
+    # save_hash_binary(hash_file(DB_FILE), './hashes/' + DB_FILE + '.hash')
+    if os.path.exists('./hashes/' + DB_FILE + '.hash'):
+        if not check_hash_binary(hash_file(DB_FILE), './hashes/' + DB_FILE + '.hash'):
+            log_warning('Database file has been modified')
+        else:
+            log_info('Database file has not been modified')
+            return
+    else:
+        log_warning('No hash file found for database')
     data = read_csv(DB_FILE)
     data = data.sort_values(by=['hostname', 'ipv4', 'comment'])
     data = data.drop_duplicates(subset=['ipv4'])
     data = drop_duplicates(data)
     write_csv(data, DB_FILE)
     log_happy('Database sorted')
+    save_hash_binary(hash_file(DB_FILE), './hashes/' + DB_FILE + '.hash')
     log_info('sort_db: Finished')
+
 
 
 def drop_duplicates_in_known(data):
@@ -52,24 +63,27 @@ def drop_duplicates_in_known(data):
 
     log_info(f"Initial CIDR data count: {len(cidr_data)}")
     log_info(f"Initial non-CIDR data count: {len(not_cidr_data)}")
+    if len(not_cidr_data) == 0:
+        log_info("No non-CIDR data found")
+    else:
+        cidr_ips = set()
+        for cidr in cidr_data.iloc[:, 0]:
+            try:
+                ip_network = ipaddress.ip_network(cidr, strict=False)
+                cidr_ips.update(str(ip) for ip in ip_network.hosts())
+            except ValueError as e:
+                log_warning(f"Invalid CIDR notation {cidr}: {e}")
+                # remove invalid CIDR
+                cidr_data = cidr_data[~(cidr_data.iloc[:, 0].astype(str) == cidr)]
+                log_info(f'Dropped {cidr} because it is invalid CIDR notation')
 
-    cidr_ips = set()
-    for cidr in cidr_data.iloc[:, 0]:
-        try:
-            ip_network = ipaddress.ip_network(cidr, strict=False)
-            cidr_ips.update(str(ip) for ip in ip_network.hosts())
-        except ValueError as e:
-            log_warning(f"Invalid CIDR notation {cidr}: {e}")
-            # remove invalid CIDR
-            cidr_data = cidr_data[~(cidr_data.iloc[:, 0].astype(str) == cidr)]
-            log_info(f'Dropped {cidr} because it is invalid CIDR notation')
+        not_cidr_data = not_cidr_data[~not_cidr_data.iloc[:, 0].astype(str).isin(cidr_ips)]
 
-    not_cidr_data = not_cidr_data[~not_cidr_data.iloc[:, 0].astype(str).isin(cidr_ips)]
+        dropped_ips = set(original_data.iloc[:, 0].astype(str)) - set(not_cidr_data.iloc[:, 0].astype(str)) - set(cidr_data.iloc[:, 0].astype(str))
+        for ip in dropped_ips:
+            log_info(f'Dropped {ip} because it is included in a CIDR range')
 
-    dropped_ips = set(original_data.iloc[:, 0].astype(str)) - set(not_cidr_data.iloc[:, 0].astype(str)) - set(cidr_data.iloc[:, 0].astype(str))
-    for ip in dropped_ips:
-        log_info(f'Dropped {ip} because it is included in a CIDR range')
 
     data = pd.concat([not_cidr_data, cidr_data], ignore_index=True)
     if len(original_data) != len(data):
@@ -86,9 +100,18 @@ def sort_known():
     fname = 'fake_name'
     for file in onlyfiles:
         log_info(f"Processing file: {file}")
+        if os.path.exists(f'./hashes/{file}.hash'):
+            if not check_hash_binary(hash_file(f'in/known/{file}'), f'./hashes/{file}.hash'):
+                log_warning(f'{file} has been modified')
+            else:
+                log_info(f'{file} has not been modified')
+                continue
+        else:
+            log_warning(f'No hash file found for {file}')
+
         data = read_txt_lbl(f'in/known/{file}')
         # add first line with column name
-        data = [fname] + data
+        # data = [fname] + data
 
         # print(data)
         # exit(0)
@@ -101,6 +124,7 @@
         # drop all line with fname
         data = data[data[data.columns[0]] != fname]
         write_txt(data.iloc[:, 0].tolist(), f'in/known/{file}')
+        save_hash_binary(hash_file(f'in/known/{file}'), f'./hashes/{file}.hash')
         log_happy(f'{file} sorted')
 
     log_info("sort_known: Finished")
diff --git a/src/sort_readme.py b/src/sort_readme.py
index 4fc1381..2a3ed1d 100644
--- a/src/sort_readme.py
+++ b/src/sort_readme.py
@@ -1,11 +1,20 @@
 from common import *
 import re
+import os
 
 
 # catch <!-- ... --> and <!-- ... --> in README.md
 # sort the lines between them alphabetically
 def sort_readme():
     log_info('sort_readme: Starting')
+    if os.path.exists('./hashes/README.md.hash'):
+        if not check_hash_binary(hash_file('README.md'), './hashes/README.md.hash'):
+            log_warning('README.md has been modified')
+        else:
+            log_info('README.md has not been modified')
+            return
+    else:
+        log_warning('No hash file found for README.md')
 
     with open('README.md', 'r') as f:
         readme = f.readlines()
@@ -32,6 +41,7 @@
 
     f.writelines(sorted_readme)
     log_happy('README.md sorted')
+    save_hash_binary(hash_file('README.md'), './hashes/README.md.hash')
     log_info('sort_readme: Finished')
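
Note for reviewers: every step gated by this patch follows the same skip-if-unchanged
pattern built from the new common.py helpers — hash the input, compare it with the
digest stored under hashes/, skip the step on a match, and re-save the digest after the
file is rewritten (since sorting rewrites the file, the hash must be recomputed after
processing, not before). A minimal standalone sketch of that flow; run_if_changed,
sha256_of, and process are hypothetical names for illustration, not part of the patch:

    import os
    from hashlib import sha256

    def sha256_of(path):
        # Digest of the file's current contents
        with open(path, 'rb') as f:
            return sha256(f.read()).digest()

    def run_if_changed(path, hash_path, process):
        # Skip the work when the stored digest still matches the file
        if os.path.exists(hash_path):
            with open(hash_path, 'rb') as f:
                if f.read() == sha256_of(path):
                    return False  # unchanged since the last run
        process(path)
        # Re-hash after processing, because process() may rewrite the file
        with open(hash_path, 'wb') as f:
            f.write(sha256_of(path))
        return True

    # Usage sketch, mirroring sort_readme():
    # run_if_changed('README.md', './hashes/README.md.hash', my_sort_function)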