Skip to content

Commit

Permalink
Download worker refactor (#288)
Browse files Browse the repository at this point in the history
* ClippingSubsampler rewrite and bug fixes

* More refactoring of ClippingSubsampler, plus a fix to _get_clip_intervals

* Finished refactoring ClippingSubsampler

* Final code changes

* Added docstrings

* Passed tests and linting

* Made type annotations consistent with Python 3.8

* More annotation fixes

* The Python 3.8 annotation needs a lot of hand-holding, it seems

* Pylint has to cut it out, I swear to God

* No real change, just relaunching unit tests which failed due to connection timeouts

* Linting issue

* Another linting issue

* Separated per-shard code from code that should only be executed once

* Pulled ShardStatus parameters into their own data type

* Cleaned up shard processing error handling

* Cleaned up code

* Bug fixes

* Formatting

* Fixed linting issues

* Fixing more damn linting

* Added a missing docstring

* Unified SubsetWorker and DownloadWorker code

* Bug fixes

* Linting

* Linting again

* Forgot a docstring

* Removed unnecessary manual thread handling

* Removed unused import

---------

Co-authored-by: iejMac <kilianmaciej6@gmail.com>
Co-authored-by: Romain Beaumont <romain.rom1@gmail.com>
  • Loading branch information
rom1504 and iejMac committed Feb 8, 2024
0 parents commit f14cfc5
Show file tree
Hide file tree
Showing 93 changed files with 9,651 additions and 0 deletions.
6 changes: 6 additions & 0 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Dependabot configuration: one update entry that checks the "pip" ecosystem
# in the repository root on a daily interval.
version: 2
updates:
  - package-ecosystem: "pip"
    directory: "/"
    schedule:
      interval: "daily"
56 changes: 56 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# CI workflow: runs the lint and unit-test jobs on pushes to main and on
# pull requests targeting main.
name: Continuous integration

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  # Installs the dev dependencies into a virtualenv and runs `make lint`.
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python 3.8
        uses: actions/setup-python@v2
        with:
          # Quoted so the version stays a string instead of a YAML float.
          python-version: "3.8"
      - name: Install
        run: |
          sudo apt-get update
          python3 -m venv .env
          source .env/bin/activate
          python -m pip install -U pip
          make install-dev
          sudo apt-get install -y ffmpeg
      - name: Lint
        run: |
          source .env/bin/activate
          make lint
  # Installs the package plus dev dependencies and runs `make test` for each
  # Python version in the matrix.
  tests:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Versions are quoted strings; an unquoted 3.10 would parse as 3.1.
        python-version: ["3.8"]

    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install
        run: |
          sudo apt-get update
          python3 -m venv .env
          source .env/bin/activate
          make install
          make install-dev
          sudo apt-get install -y ffmpeg
      - name: Unit tests
        run: |
          source .env/bin/activate
          make test
37 changes: 37 additions & 0 deletions .github/workflows/python-publish.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Release workflow: runs on every push to main, but the release and publish
# steps only execute when the head commit message matches "Release <tag>".
name: Release

on:
  push:
    branches:
      - main
jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      # Match the head commit message; capture group 1 is used as the tag name.
      - uses: actions-ecosystem/action-regex-match@v2
        id: regex-match
        with:
          text: ${{ github.event.head_commit.message }}
          regex: '^Release ([^ ]+)'
      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: '3.8'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install setuptools wheel twine
      # Skipped unless the regex step produced a match.
      - name: Release
        if: ${{ steps.regex-match.outputs.match != '' }}
        uses: softprops/action-gh-release@v1
        with:
          tag_name: ${{ steps.regex-match.outputs.group1 }}
      # Skipped unless the regex step produced a match; twine authenticates
      # with the PYPI_PASSWORD secret as an API token.
      - name: Build and publish
        if: ${{ steps.regex-match.outputs.match != '' }}
        env:
          TWINE_USERNAME: __token__
          TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
        run: |
          python setup.py sdist bdist_wheel
          twine upload dist/*
13 changes: 13 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Paths excluded from version control.
*.egg-info
.vscode
# Virtualenv directory created by the CI workflows (`python3 -m venv .env`).
.env
__pycache__
.envtest
.coverage*
# NOTE(review): this glob already covers `.env` and `.envtest` listed above.
.env*
# presumably the Weights & Biases run directory — TODO confirm
wandb
*.pex
.pexing
**/dataset/*
# dist/ is the directory the release workflow uploads from (`twine upload dist/*`).
dist/
build/
268 changes: 268 additions & 0 deletions .pylintrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,268 @@
[MASTER]

# Specify a configuration file.
#rcfile=

# Python code to execute, usually for sys.path manipulation such as
# pygtk.require().
#init-hook=

# Add files or directories to the blacklist. They should be base names, not
# paths.
ignore=CVS

# Pickle collected data for later comparisons.
persistent=yes

# List of plugins (as comma separated values of python modules names) to load,
# usually to register additional checkers.
load-plugins=


[MESSAGES CONTROL]

# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifiers separated by commas (,) or put this option
# multiple times. See also the "--disable" option for examples.
enable=indexing-exception,old-raise-syntax

# Disable the message, report, category or checker with the given id(s). You
# can either give multiple identifiers separated by comma (,) or put this
# option multiple times (only on the command line, not in the configuration
# file where it should appear only once). You can also use "--disable=all" to
# disable everything first and then reenable specific checks. For example, if
# you want to run only the similarities checker, you can use "--disable=all
# --enable=similarities". If you want to run only the classes checker, but have
# no Warning level messages displayed, use "--disable=all --enable=classes
# --disable=W".
disable=design,similarities,no-self-use,attribute-defined-outside-init,locally-disabled,star-args,pointless-except,bad-option-value,global-statement,fixme,suppressed-message,useless-suppression,locally-enabled,no-member,no-name-in-module,import-error,unsubscriptable-object,unbalanced-tuple-unpacking,undefined-variable,not-context-manager,no-else-return,wrong-import-order,unnecessary-pass,logging-fstring-interpolation,logging-format-interpolation,C0330


[REPORTS]

# Set the output format. Available formats are text, parseable, colorized, msvs
# (visual studio) and html. You can also give a reporter class, eg
# mypackage.mymodule.MyReporterClass.
output-format=text

# Tells whether to display a full report or only the messages
reports=no

# Python expression which should return a score less than or equal to 10 (10 is
# the highest score). You have access to the variables error, warning, refactor,
# convention and statement, which contain the number of messages in each of
# those categories and the total number of statements analyzed. This is used by
# the global evaluation report (RP0004).
evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)

# Template used to display messages. This is a python new-style format string
# used to format the message information. See doc for all details
#msg-template=


[TYPECHECK]

# Tells whether missing members accessed in mixin class should be ignored. A
# mixin class is detected if its name ends with "mixin" (case insensitive).
ignore-mixin-members=yes

# List of classes names for which member attributes should not be checked
# (useful for classes with attributes dynamically set).
ignored-classes=SQLObject

# List of members which are set dynamically and missed by pylint inference
# system, and so shouldn't trigger E0201 when accessed. Python regular
# expressions are accepted.
generated-members=REQUEST,acl_users,aq_parent

# List of decorators that create context managers from functions, such as
# contextlib.contextmanager.
contextmanager-decorators=contextlib.contextmanager,contextlib2.contextmanager


[VARIABLES]

# Tells whether we should check for unused import in __init__ files.
init-import=no

# A regular expression matching the beginning of the name of dummy variables
# (i.e. not used).
dummy-variables-rgx=^\*{0,2}(_$|unused_|dummy_)

# List of additional names supposed to be defined in builtins. Remember that
# you should avoid defining new builtins when possible.
additional-builtins=


[BASIC]

# Regular expression which should only match correct module names
module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$

# Regular expression which should only match correct module level names
const-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$

# Regular expression which should only match correct class names
class-rgx=^_?[A-Z][a-zA-Z0-9]*$

# Regular expression which should only match correct function names
function-rgx=^(?:(?P<camel_case>_?[A-Z][a-zA-Z0-9]*)|(?P<snake_case>_?[a-z][a-z0-9_]*))$

# Regular expression which should only match correct method names
method-rgx=^(?:(?P<exempt>__[a-z0-9_]+__|next)|(?P<camel_case>_{0,2}[A-Z][a-zA-Z0-9]*)|(?P<snake_case>_{0,2}[a-z][a-z0-9_]*))$

# Regular expression which should only match correct instance attribute names
attr-rgx=^_{0,2}[a-z][a-z0-9_]*$

# Regular expression which should only match correct argument names
argument-rgx=^[a-z][a-z0-9_]*$

# Regular expression which should only match correct variable names
variable-rgx=^[a-z][a-z0-9_]*$

# Regular expression which should only match correct attribute names in class
# bodies
class-attribute-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$

# Regular expression which should only match correct list comprehension /
# generator expression variable names
inlinevar-rgx=^[a-z][a-z0-9_]*$

# Good variable names which should always be accepted, separated by a comma
good-names=main,_

# Bad variable names which should always be refused, separated by a comma
bad-names=

# Regular expression which should only match function or class names that do
# not require a docstring.
no-docstring-rgx=(__.*__|main)

# Minimum line length for functions/classes that require docstrings, shorter
# ones are exempt.
docstring-min-length=10


[FORMAT]

# Maximum number of characters on a single line.
max-line-length=120

# Regexp for a line that is allowed to be longer than the limit.
# The pattern uses verbose mode ((?x)), so the leading whitespace on the
# continuation lines is not part of the regex; the lines must stay indented so
# the INI parser reads them as a continuation of this value.
ignore-long-lines=(?x)
  (^\s*(import|from)\s
   |\$Id:\s\/\/depot\/.+#\d+\s\$
   |^[a-zA-Z_][a-zA-Z0-9_]*\s*=\s*("[^"]\S+"|'[^']\S+')
   |^\s*\#\ LINT\.ThenChange
   |^[^#]*\#\ type:\ [a-zA-Z_][a-zA-Z0-9_.,[\] ]*$
   |pylint
   |"""
   |\#
   |lambda
   |(https?|ftp):)

# Allow the body of an if to be on the same line as the test if there is no
# else.
single-line-if-stmt=y

# Maximum number of lines in a module
max-module-lines=99999

# String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
# tab).
indent-string='    '


[SIMILARITIES]

# Minimum lines number of a similarity.
min-similarity-lines=4

# Ignore comments when computing similarities.
ignore-comments=yes

# Ignore docstrings when computing similarities.
ignore-docstrings=yes

# Ignore imports when computing similarities.
ignore-imports=no


[MISCELLANEOUS]

# List of note tags to take in consideration, separated by a comma.
notes=


[IMPORTS]

# Deprecated modules which should not be used, separated by a comma
deprecated-modules=regsub,TERMIOS,Bastion,rexec,sets

# Create a graph of all (i.e. internal and external) dependencies in the
# given file (report RP0402 must not be disabled)
import-graph=

# Create a graph of external dependencies in the given file (report RP0402 must
# not be disabled)
ext-import-graph=

# Create a graph of internal dependencies in the given file (report RP0402 must
# not be disabled)
int-import-graph=

extension-pkg-whitelist=_jsonnet


[CLASSES]

# List of method names used to declare (i.e. assign) instance attributes.
defining-attr-methods=__init__,__new__,setUp

# List of valid names for the first argument in a class method.
valid-classmethod-first-arg=cls,class_

# List of valid names for the first argument in a metaclass class method.
valid-metaclass-classmethod-first-arg=mcs


[DESIGN]

# Maximum number of arguments for function / method
max-args=5

# Argument names that match this expression will be ignored. Default to name
# with leading underscore
ignored-argument-names=_.*

# Maximum number of locals for function / method body
max-locals=15

# Maximum number of return / yield for function / method body
max-returns=6

# Maximum number of branches for function / method body
max-branches=12

# Maximum number of statements in function / method body
max-statements=50

# Maximum number of parents for a class (see R0901).
max-parents=7

# Maximum number of attributes for a class (see R0902).
max-attributes=7

# Minimum number of public methods for a class (see R0903).
min-public-methods=2

# Maximum number of public methods for a class (see R0904).
max-public-methods=20



[TOKENS]

# Number of spaces of indent required when the last token on the preceding line
# is an open (, [, or {.
indent-after-paren=4
Loading

0 comments on commit f14cfc5

Please sign in to comment.