Skip to content

Commit

Permalink
Merge pull request #48 from kozalosev/feature/url-cleaners
Browse files Browse the repository at this point in the history
Add URLCleaner and InstaFix processors
  • Loading branch information
Leonid Kozarin authored Oct 7, 2023
2 parents d5b1a7d + a3e0098 commit a7554c4
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 5 deletions.
9 changes: 9 additions & 0 deletions app/localizations.ini
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,12 @@ hint_hexadecimal_decoder = Plain text
hint_base64_encoder = Encrypted text
help_base64_encoder = Converts some text to a base64 string. For example, the word `hello` will be transformed into `aGVsbG8=`.
hint_base64_decoder = Unencrypted text
hint_url_encoder = URLEncoder
help_url_encoder = Encodes URL addresses containing non-Latin characters exactly like Chrome does, and vice versa. The address must start with `http://` or `https://`.
hint_url_cleaner = URL cleaner
help_url_cleaner = Removes UTM and SI labels and text highlighting from the URL.
hint_insta_fix = InstaFix
help_insta_fix = Converts a link to serve a fixed Instagram image and video embed.
hint_calculator = Substitutions calculator
help_calculator = Evaluates expressions like `Total = {{ 2*2 + 2 $ > RUB }}` into something like `Total = 448.77 RUB`
hint_single_expression_calculator = Calculator
Expand Down Expand Up @@ -81,6 +86,10 @@ hint_base64_encoder = Шифровка
help_base64_encoder = Кодирует текст в base64-строку. Например, слово `привет` превратится в `0L/RgNC40LLQtdGC`.
hint_base64_decoder = Дешифровка
help_url_encoder = Кодирует URL-адрес с нелатинскими символами так же, как это делает Chrome. Возможно обратное декодирование. Адрес должен начинаться с `http://` или `https://`.
hint_url_cleaner = Очиститель URL
help_url_cleaner = Удаляет UTM- и SI-метки, подсветку текста из адреса.
hint_insta_fix = InstaFix
help_insta_fix = Преобразует ссылку, чтобы получить встраиваемое представление фото или видео.
hint_calculator = Подстановочный калькулятор
help_calculator = Вычисляет выражения типа `сумма = {{ 2*2 + 2 $ > RUB }}` во что-то вроде `сумма = 448.77 RUB`
hint_single_expression_calculator = Калькулятор
Expand Down
2 changes: 0 additions & 2 deletions app/strconv/calc.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,6 @@ def next_expr(self, query: str, lang_code: str, sb: StringIO, pos: int = 0, guar
expr = "1"
expr = expr.strip().replace(",", ".")
val = _eval_func(expr)
if not isinstance(val, (int, float)):
self._logger.warning(f"'{val}' is not a number for some reason. The query was '{query}', expr: '{expr}'")
from_curr = match.group("from_curr")
to_curr = match.group("to_curr")
if from_curr:
Expand Down
55 changes: 53 additions & 2 deletions app/strconv/url.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,10 @@
import re
from abc import ABC
from typing import Collection
from urllib.parse import quote, unquote, urlparse, urlunparse
from urllib.parse import quote, unquote, urlparse, urlunparse, parse_qs, ParseResult

from txtproc.abc import PrefixedTextProcessor, Reversible


_re_url_encoded_char = re.compile("%[0-9]{2}")


Expand Down Expand Up @@ -79,3 +78,55 @@ def do_transform(cls, text: str) -> str:
def _process_netloc(domains: str) -> str:
decoded_domains = [domain.encode('utf-8').decode('idna') for domain in domains.split('.')]
return '.'.join(decoded_domains)


class URLCleaner(URLPrefixedTextProcessor):
    """Strip tracking labels and text highlighting from a URL.

    Removed parts:

    * query parameters whose name starts with ``utm_`` or equals ``si``;
    * the ``:~:text=`` directive (with everything after it) in the fragment.

    The remaining query parameters are kept verbatim: same order, same
    encoding, blank values and repeated keys included.
    """

    # Marker that starts a text-highlighting directive inside a URL fragment.
    _TEXT_DIRECTIVE = ":~:text="

    def transform(self, text: str) -> str:
        """Return *text* without tracking parameters and text highlighting."""
        url = urlparse(text)
        url = self._get_rid_of_utm_labels(url)
        url = self._get_rid_of_text_highlighting(url)
        return urlunparse(url)

    @classmethod
    def text_filter(cls, text: str) -> bool:
        """Tell whether *text* contains anything this processor would remove.

        The check looks at real query parameter names and at the fragment
        instead of raw substrings, so a URL like ``https://site.com/`` is not
        accepted just because it contains the letters "si".
        """
        try:
            url = urlparse(text)
        except ValueError:
            # urlparse rejects some malformed inputs (e.g. a broken IPv6 host);
            # such text cannot be cleaned, so it is simply not accepted.
            return False
        has_tracking_param = any(cls._is_tracking_param(param) for param in url.query.split('&'))
        return has_tracking_param or cls._TEXT_DIRECTIVE in url.fragment

    @staticmethod
    def _is_tracking_param(param: str) -> bool:
        """Check a raw ``name=value`` query segment for a UTM or SI label."""
        name = param.partition('=')[0]
        return name.startswith("utm_") or name == "si"

    @staticmethod
    def _get_rid_of_utm_labels(url: ParseResult) -> ParseResult:
        """Return *url* with the ``utm_*`` and ``si`` query parameters removed."""
        # The raw query is filtered segment by segment: parse_qs() yields lists
        # of values, which must not be formatted back into the query as-is.
        kept = [
            param for param in url.query.split('&')
            if param and not URLCleaner._is_tracking_param(param)
        ]
        return url._replace(query='&'.join(kept))

    @staticmethod
    def _get_rid_of_text_highlighting(url: ParseResult) -> ParseResult:
        """Return *url* with the text-highlighting directive cut off the fragment.

        A regular anchor placed before the directive (``#intro:~:text=...``)
        is preserved.
        """
        anchor, directive, _ = url.fragment.partition(URLCleaner._TEXT_DIRECTIVE)
        if not directive:
            return url
        return url._replace(fragment=anchor)


class InstaFix(URLPrefixedTextProcessor):
    """Rewrite an Instagram link so that it points to ``ddinstagram.com``.

    The ``igshid`` query parameter is dropped, a leading ``www.`` is removed
    from the host, and the host gets the ``dd`` prefix unless it already has it.
    """

    def transform(self, text: str) -> str:
        """Return the rewritten form of the link given in *text*."""
        url = urlparse(text)
        url = self._get_rid_of_igshid(url)
        url = self._add_dd(url)
        return urlunparse(url)

    @classmethod
    def text_filter(cls, text: str) -> bool:
        """Accept only text that mentions ``instagram.com``.

        A bare ``igshid`` parameter is not enough: prefixing an unrelated
        host with ``dd`` would produce a bogus address.
        """
        return "instagram.com" in text

    @staticmethod
    def _get_rid_of_igshid(url: ParseResult) -> ParseResult:
        """Return *url* without the ``igshid`` query parameter.

        A link without ``igshid`` is returned with its query unchanged (apart
        from empty segments being dropped). The remaining parameters are
        taken from the raw query, so their order and encoding are preserved.
        """
        kept = [
            param for param in url.query.split('&')
            if param and param.partition('=')[0] != "igshid"
        ]
        return url._replace(query='&'.join(kept))

    @staticmethod
    def _add_dd(url: ParseResult) -> ParseResult:
        """Return *url* with ``www.`` stripped from the host and ``dd`` prepended."""
        domain = url.netloc
        if domain.startswith("www."):
            domain = domain[4:]
        if not domain.startswith("dd"):
            domain = "dd" + domain
        return url._replace(netloc=domain)
32 changes: 31 additions & 1 deletion tests/test_strconv/test_url.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from strconv.url import URLEncoder, URLDecoder
from strconv.url import URLEncoder, URLDecoder, URLCleaner, InstaFix

url = "http://сайт.рф/путь;параметр=значение?запрос1=значение1&запрос2=значение2#хэш"
encoded_url = "http://xn--80aswg.xn--p1ai/%D0%BF%D1%83%D1%82%D1%8C;%D0%BF%D0%B0%D1%80%D0%B0%D0%BC%D0%B5%D1%82%D1%80=%D0%B7%D0%BD%D0%B0%D1%87%D0%B5%D0%BD%D0%B8%D0%B5?%D0%B7%D0%B0%D0%BF%D1%80%D0%BE%D1%811=%D0%B7%D0%BD%D0%B0%D1%87%D0%B5%D0%BD%D0%B8%D0%B51&%D0%B7%D0%B0%D0%BF%D1%80%D0%BE%D1%812=%D0%B7%D0%BD%D0%B0%D1%87%D0%B5%D0%BD%D0%B8%D0%B52#%D1%85%D1%8D%D1%88"
Expand Down Expand Up @@ -33,3 +33,33 @@ def test_process(self):
domain_and_path = "http://сайт.рф/путь"
assert self.decoder.process("http://сайт.рф/%D0%BF%D1%83%D1%82%D1%8C") == domain_and_path
assert self.decoder.process("http://xn--80aswg.xn--p1ai/путь") == domain_and_path


class TestURLCleaner:
    """Checks which queries URLCleaner accepts and what it returns for them."""

    cleaner = URLCleaner()

    def test_can_process(self):
        accepted = (
            "https://test.domain/?utm_id=utm&si=123456",
            "https://qwerty:123456@test.domain/?utm_id=utm&si=123456",
        )
        rejected = (
            "https://test.domain/",
            # presumably rejected for lacking the http(s):// prefix
            "test.domain/?utm=utm&si=123456",
        )
        for query in accepted:
            assert self.cleaner.can_process(query)
        for query in rejected:
            assert not self.cleaner.can_process(query)

    def test_process(self):
        expectations = {
            "https://test.domain/?utm_id=utm&si=123456":
                "https://test.domain/",
            "https://qwerty:123456@test.domain/?utm_id=utm&si=123456":
                "https://qwerty:123456@test.domain/",
        }
        for dirty, clean in expectations.items():
            assert self.cleaner.process(dirty) == clean


class TestInstaFix:
    """Checks InstaFix on a reel link carrying an ``igshid`` parameter."""

    reels_url = "https://www.instagram.com/reel/CvkEuXdNKHC/?igshid=NTc4MTIwNjQ2YQ=="
    result_url = "https://ddinstagram.com/reel/CvkEuXdNKHC/"

    instafix = InstaFix()

    def test_can_process(self):
        processor = self.instafix
        assert processor.can_process(self.reels_url)
        assert not processor.can_process("https://test.domain/")

    def test_process(self):
        converted = self.instafix.process(self.reels_url)
        assert converted == self.result_url

0 comments on commit a7554c4

Please sign in to comment.