Commit

Improve deduplication.
xkww3n committed Oct 15, 2023
1 parent 439be05 commit ccf6aa6
Showing 2 changed files with 26 additions and 16 deletions.
19 changes: 19 additions & 0 deletions Utils/rule.py
@@ -25,6 +25,10 @@ def __hash__(self):
     def __eq__(self, other):
         return self.Type == other.Type and self.Payload == other.Payload
 
+    def includes(self, other):
+        return ("." + other.Payload if other.Type == "DomainSuffix" else other.Payload).endswith(
+            "." + self.Payload if self.Type == "DomainSuffix" else self.Payload)
+
 
 def custom_convert(src: Path) -> set:
     src_custom = open(src, mode="r", encoding="utf-8").read().splitlines()
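The new includes() reads self as a pattern and asks whether it covers other: a DomainSuffix rule matches anything beneath it once a leading dot is pinned to both sides, while a DomainFull rule can only match its literal payload. A quick sketch of the behavior (the Rule constructor arguments here are assumed for illustration, not taken from this diff):

    suffix = Rule("DomainSuffix", "example.com")
    full = Rule("DomainFull", "www.example.com")

    suffix.includes(full)  # True:  "www.example.com" ends with ".example.com"
    full.includes(suffix)  # False: ".example.com" does not end with "www.example.com"
    suffix.includes(Rule("DomainFull", "example.com"))  # False: the bare domain lacks the leading dot

The last case is why generate.py below still pairs an equal-payload DomainFull rejection with a DomainSuffix exclusion explicitly.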
@@ -204,3 +208,18 @@ def apply_patch(src: set, name: str) -> set:
             logging.warning(f"Not found: {rule}")
     logging.info(f'Patch "{name + ".txt"}" applied.')
     return src
+
+
+def dedup(src: set):
+    list_length_sorted = [item for item in src]
+    list_length_sorted.sort(key=lambda item: len(str(item)))
+    set_unique = set()
+    for item in list_length_sorted:
+        flag_unique = True
+        for added in set_unique:
+            if added.includes(item):
+                flag_unique = False
+                logging.debug(f"{item} is removed as duplicated with {added}.")
+        if flag_unique:
+            set_unique.add(item)
+    return set_unique
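dedup() leans on one practical invariant: for these rule types, a covering rule's string form is no longer than what it covers, so sorting by len(str(item)) admits broad rules into set_unique before anything they subsume; each later candidate survives only if no retained rule includes() it. The inner loop keeps scanning after the first hit; a break there would change nothing but the running time. A self-contained sketch of that behavior, using a hypothetical stand-in for the Rule class defined earlier in this file:

    from dataclasses import dataclass
    from Utils.rule import dedup  # the function added above

    @dataclass(frozen=True)  # frozen -> hashable, so instances can live in sets
    class Rule:
        Type: str
        Payload: str

        def includes(self, other):
            # Same logic as the method added in this commit.
            return ("." + other.Payload if other.Type == "DomainSuffix" else other.Payload).endswith(
                "." + self.Payload if self.Type == "DomainSuffix" else self.Payload)

    rules = {
        Rule("DomainSuffix", "example.com"),
        Rule("DomainSuffix", "cdn.example.com"),  # covered by the rule above
        Rule("DomainFull", "www.example.com"),    # covered as well
    }
    assert dedup(rules) == {Rule("DomainSuffix", "example.com")}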
23 changes: 7 additions & 16 deletions generate.py
@@ -62,14 +62,15 @@
 
 set_exclusions = set()
 logger.debug("Start deduplicating reject and exclude set.")
+set_rejections = rule.dedup(set_rejections)
 for domain_exclude in set_exclusions_raw.copy():
     for domain_reject in set_rejections.copy():
         if (domain_reject.Payload == domain_exclude.Payload and domain_reject.Type == domain_exclude.Type) \
                 or (domain_reject.Payload == domain_exclude.Payload and
                     domain_reject.Type == "DomainFull" and domain_exclude.Type == "DomainSuffix"):
             set_rejections.remove(domain_reject)
             set_exclusions_raw.remove(domain_exclude)
-            logger.debug(f"{domain_reject} is removed as duplicated with {domain_exclude}.")
+            logger.debug(f"{domain_reject} is removed as excluded by {domain_exclude}.")
 
 for domain_exclude in set_exclusions_raw:
     for domain_reject in set_rejections:
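The condition above fires only on equal payloads: either the two rules are identical, or a DomainFull rejection coincides with a DomainSuffix exclusion of the same name, in which case the broader exclusion wins. Running rule.dedup() first presumably keeps this pass from matching against redundant narrower copies. A hypothetical pair caught by the second branch (values invented for illustration):

    domain_reject = Rule("DomainFull", "tracker.example.org")
    domain_exclude = Rule("DomainSuffix", "tracker.example.org")
    # Equal payloads, DomainFull vs DomainSuffix: the rejection is
    # removed and logged as "excluded by" rather than "duplicated with".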
@@ -96,27 +97,17 @@
 logger.info("Start generating domestic rules.")
 START_TIME = time_ns()
 
-src_domestic_raw = set(open(const.PATH_SOURCE_V2FLY/"geolocation-cn", mode="r", encoding="utf-8").read().splitlines())
-set_domestic_raw = geosite.parse(src_domestic_raw, None, ["!cn"])
-logger.info(f"Imported {len(set_domestic_raw)} domestic rules from v2fly geolocation-cn list.")
+src_domestic = set(open(const.PATH_SOURCE_V2FLY/"geolocation-cn", mode="r", encoding="utf-8").read().splitlines())
+set_domestic = geosite.parse(src_domestic, None, ["!cn"])
+logger.info(f"Imported {len(set_domestic)} domestic rules from v2fly geolocation-cn list.")
 
-# Add all domestic TLDs to domestic rules, then remove domestic domains with domestic TLDs.
+# Add all domestic TLDs to domestic rules, then perform deduplication.
 src_domestic_tlds = set(open(const.PATH_SOURCE_V2FLY/"tld-cn", mode="r", encoding="utf-8").read().splitlines())
 set_domestic_tlds = geosite.parse(src_domestic_tlds)
 logger.info(f"Imported {len(set_domestic_tlds)} domestic TLDs.")
-set_domestic = set()
-for domain in set_domestic_raw:
-    is_domestic = False
-    for tld in set_domestic_tlds:
-        if domain.Payload.endswith(tld.Payload):
-            logger.debug(f'"{domain.Payload}"" is removed for having a domestic TLD "{tld.Payload}"".')
-            is_domestic = True
-            break
-    if not is_domestic:
-        set_domestic.add(domain)
-logger.info(f"Removed {len(set_domestic_raw) - len(set_domestic)} domestic domains having domestic TLD.")
 set_domestic |= set_domestic_tlds
 set_domestic = rule.apply_patch(set_domestic, "domestic")
+set_domestic = rule.dedup(set_domestic)
 logger.info(f"Generated {len(set_domestic)} domestic rules.")
 
 list_domestic_sorted = rule.set_to_sorted_list(set_domestic)
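With this change, the hand-rolled TLD loop disappears: unioning set_domestic_tlds into set_domestic and calling rule.dedup() lets each DomainSuffix TLD rule absorb everything beneath it via includes(). The leading dot that includes() pins on also gives a real label boundary, which the old raw endswith() comparison on payloads lacked. A hypothetical run (rule values invented):

    set_domestic = {Rule("DomainSuffix", "example.cn"), Rule("DomainFull", "www.example.cn")}
    set_domestic |= {Rule("DomainSuffix", "cn")}  # stand-in for a tld-cn entry
    set_domestic = rule.dedup(set_domestic)
    # Only Rule("DomainSuffix", "cn") survives: the other two payloads
    # end with ".cn", so includes() reports them as covered.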