Skip to content

Commit

Permalink
add in transforms pipe
Browse files Browse the repository at this point in the history
  • Loading branch information
ericherman committed Jun 1, 2023
1 parent 62fd4a8 commit 69af457
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 14 deletions.
4 changes: 4 additions & 0 deletions url-check-config.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,9 @@
"^http[s]\\?://twitter\\.com": "302; does not serve scripts",
"^http[s]\\?://github\\.com/.*/edit/": "may point to yet-to-exist page",
"^http[s]\\?://docs\\.github\\.com/": "seems blocked as DoS protection"
},
"transforms" : {
"sed 's@^\\(https://example.org/.*\\)[\\.,)]$@\\1@'":
"remove trailing punctuation from example.org links"
}
}
47 changes: 36 additions & 11 deletions url-check.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,6 @@ def files_from_repo(repos_basedir, repo_name, repo_url, branch, ctx=None):


def urls_from(workdir, file, user_ignore_patterns=[], ctx=None):
found = []
# pull URLs out of the file, including optional leading paren
cmd_str = f"grep --extended-regexp --only-matching --text \
'[\\(]?(http|https)://[-a-zA-Z0-9\./\\?=_%:\\(\\)]*' \
Expand All @@ -136,12 +135,14 @@ def urls_from(workdir, file, user_ignore_patterns=[], ctx=None):
for pattern in ignore_patterns:
cmd_str += f" | grep --invert-match '{pattern}'"

urls = shell_slurp(cmd_str, workdir, ctx).splitlines()
for url in urls:
lines = shell_slurp(cmd_str, workdir, ctx).splitlines()
urls = []
for line in lines:
# ignore 'binary file matches' messages, only grab URLs
if url.startswith("http"):
found += [url]
return found
if line.startswith("http"):
urls += [line]

return urls


def clear_previous_used(checks, name):
Expand All @@ -151,9 +152,28 @@ def clear_previous_used(checks, name):
checks[url]["used"][name] = []


def set_used_for_file(checks, gits_dir, name, file, ignore_patterns, ctx):
def transform_urls(transforms, urls, ctx):
    """Pipe the URL list through each shell transform command, in order.

    Parameters:
        transforms: iterable of shell command strings (e.g. sed expressions);
            each reads URLs one-per-line on stdin, writes lines to stdout.
        urls: list of URL strings to transform.
        ctx: context object forwarded to shell_slurp (logging/capture).

    Returns:
        list of transformed lines that still look like URLs
        (i.e. start with "http"); any other output lines are dropped.
    """
    # function-scope import keeps the module's import block untouched
    import shlex

    urls_str = "\n".join(urls)

    # shlex.quote ensures a URL containing a single quote or other shell
    # metacharacters cannot break out of the echo argument -- the previous
    # f"echo '{urls_str}'" form did not survive embedded quotes.
    cmd = "echo " + shlex.quote(urls_str)
    for transform in transforms:
        cmd = cmd + f" | {transform}"

    urls_str = shell_slurp(cmd, ".", ctx)

    # keep only lines that still look like URLs; a transform may emit noise
    transformed_urls = []
    for line in urls_str.splitlines():
        if line.startswith("http"):
            transformed_urls += [line]

    return transformed_urls


def set_used_for_file(
checks, gits_dir, name, file, ignore_patterns, transforms, ctx):
repo_dir = os.path.join(gits_dir, name)
urls = urls_from(repo_dir, file, ignore_patterns, ctx)
urls = transform_urls(transforms, urls, ctx)
for url in urls:
if url not in checks.keys():
checks[url] = {}
Expand All @@ -165,10 +185,11 @@ def set_used_for_file(checks, gits_dir, name, file, ignore_patterns, ctx):
checks[url]["used"][name] += [file]


def set_used(checks, gits_dir, name, files, ignore_patterns, ctx):
def set_used(checks, gits_dir, name, files, ignore_patterns, transforms, ctx):
    """Refresh the 'used' bookkeeping in `checks` for repo `name`.

    Clears any stale usage records for this repo, then re-scans every
    file, applying the ignore patterns and URL transforms as it goes.
    """
    clear_previous_used(checks, name)
    for one_file in files:
        set_used_for_file(
            checks, gits_dir, name, one_file, ignore_patterns, transforms, ctx)


def remove_unused(checks):
Expand Down Expand Up @@ -291,6 +312,7 @@ def url_check_all(gits_dir,
repos_files,
timeout,
ignore_patterns=[],
transforms=[],
ctx=None):

for url in checks.keys():
Expand All @@ -299,7 +321,8 @@ def url_check_all(gits_dir,
for repo_name, files in repos_files.items():
ctx.log(repo_name, "contains", len(files), "files")
ctx.debug(files)
set_used(checks, gits_dir, repo_name, files, ignore_patterns, ctx)
set_used(checks, gits_dir, repo_name, files, ignore_patterns, transforms,
ctx)

ctx.debug("checks length:", len(checks), "before unused removed")
checks = remove_unused(checks)
Expand Down Expand Up @@ -387,12 +410,14 @@ def main(sys_argv=sys.argv, ctx=default_context()):
repos_info = config_obj["repositories"]
ignore_patterns_map = config_obj.get("ignore_patterns", {})
add_ignore_patterns = ignore_patterns_map.keys()
transforms_map = config_obj.get("transforms", {})
transforms = transforms_map.keys()

repos_files = read_repos_files(gits_dir, repos_info, ctx)

orig_checks = read_json(checks_path)
checks = url_check_all(gits_dir, orig_checks, repos_files, timeout,
add_ignore_patterns, ctx)
add_ignore_patterns, transforms, ctx)

write_json(checks_path, checks)
condensed = condense_results(checks, repos_info.keys())
Expand Down
35 changes: 32 additions & 3 deletions url-check.test.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def debug(self, *args, **kwargs):
return self.log(args, kwargs)


class TestSum(unittest.TestCase):
class Test_url_check(unittest.TestCase):

def test_system_context(self):
ctx = uc.System_Context()
Expand Down Expand Up @@ -118,7 +118,8 @@ def test_set_used(self):
config = uc.read_json('url-check-config.json')

ignore = config.get("ignore_patterns").keys()
uc.set_used_for_file(checks, gits_dir, name, file, ignore, ctx)
transforms = []
uc.set_used_for_file(checks, gits_dir, name, file, ignore, transforms, ctx)

self.assertNotIn("https://twitter.com", checks)

Expand Down Expand Up @@ -186,6 +187,32 @@ def test_read_repos_files(self):
repo_files = uc.read_repos_files(gits_dir, repos, ctx)
self.assertIn("url-check.test.py", repo_files[repo_name])

def test_transform_urls(self):
    """transform_urls: identity with no transforms; sed chain rewrites."""
    ctx = Test_Context()
    ctx.capture = True
    ctx.verbose = True

    urls = [
        'https://example.org/one.html',
        'https://example.org/obsolete.html',
        'https://example.org/three.html',
    ]

    # an empty transform list must leave the URLs unchanged
    self.assertEqual(uc.transform_urls([], urls, ctx), urls)

    # transforms are applied left-to-right, like a shell pipeline:
    # rename obsolete -> foo (adding a stray paren), foo -> two,
    # then strip the trailing punctuation the first sed introduced
    transforms = [
        "sed 's@obsolete\\.html@foo.html)@g'",
        "sed 's@foo@two@g'",
        "sed 's@\\(example.org/.*\\)[\\.,)]$@\\1@g'",
    ]
    expected_urls = [
        'https://example.org/one.html',
        'https://example.org/two.html',
        'https://example.org/three.html',
    ]
    actual = uc.transform_urls(transforms, urls, ctx)
    self.assertEqual(actual, expected_urls, ctx.out)

def test_remove_unused(self):
url3 = "https://example.org/three.html"
checks = {
Expand Down Expand Up @@ -331,9 +358,11 @@ def test_url_check_all(self):
}
}
add_ignore = []
transforms = []
ctx = Test_Context()
timeout = 2
checks = uc.url_check_all('.', checks, repos_files, timeout, add_ignore,
Test_Context())
transforms, ctx)
self.maxDiff = None
self.assertEqual(checks, expected)

Expand Down

0 comments on commit 69af457

Please sign in to comment.