diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index ea27a584..4ecfbfe3 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -2,6 +2,7 @@ "name": "nfcore", "image": "nfcore/gitpod:latest", "remoteUser": "gitpod", + "runArgs": ["--privileged"], // Configure tool-specific properties. "customizations": { diff --git a/.editorconfig b/.editorconfig index b6b31907..9b990088 100644 --- a/.editorconfig +++ b/.editorconfig @@ -22,3 +22,11 @@ indent_size = unset [/assets/email*] indent_size = unset + +# ignore Readme +[README.md] +indent_style = unset + +# ignore python +[*.{py}] +indent_style = unset diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 12d0b7ac..a3ebc30c 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -9,7 +9,9 @@ Please use the pre-filled template to save time. However, don't be put off by this template - other more general issues and suggestions are welcome! Contributions to the code are even more welcome ;) -> If you need help using or modifying nf-core/circdna then the best place to ask is on the nf-core Slack [#circdna](https://nfcore.slack.com/channels/circdna) channel ([join our Slack here](https://nf-co.re/join/slack)). +:::info +If you need help using or modifying nf-core/circdna then the best place to ask is on the nf-core Slack [#circdna](https://nfcore.slack.com/channels/circdna) channel ([join our Slack here](https://nf-co.re/join/slack)). +::: ## Contribution workflow @@ -25,6 +27,9 @@ If you're not used to this workflow with git, you can start with some [docs from ## Tests +You can optionally test your changes by running the pipeline locally. Then it is recommended to use the `debug` profile to +receive warnings about process selectors and other debug info. Example: `nextflow run . -profile debug,test,docker --outdir `. + When you create a pull request with changes, [GitHub Actions](https://github.com/features/actions) will run automatic tests. Typically, pull-requests are only fully reviewed when these tests are passing, though of course we can help out before then. @@ -116,4 +121,3 @@ To get started: Devcontainer specs: - [DevContainer config](.devcontainer/devcontainer.json) -- [Dockerfile](.devcontainer/Dockerfile) diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index dc1c8693..078e735f 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -42,7 +42,7 @@ body: attributes: label: System information description: | - * Nextflow version _(eg. 22.10.1)_ + * Nextflow version _(eg. 23.04.0)_ * Hardware _(eg. HPC, Desktop, Cloud)_ * Executor _(eg. slurm, local, awsbatch)_ * Container engine: _(e.g. Docker, Singularity, Conda, Podman, Shifter, Charliecloud, or Apptainer)_ diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 6ef92dc7..7e1699b3 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -19,6 +19,7 @@ Learn more about contributing: [CONTRIBUTING.md](https://github.com/nf-core/circ - [ ] If necessary, also make a PR on the nf-core/circdna _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository. - [ ] Make sure your code lints (`nf-core lint`). - [ ] Ensure the test suite passes (`nextflow run . -profile test,docker --outdir `). +- [ ] Check for unexpected warnings in debug mode (`nextflow run . -profile debug,test,docker --outdir `). 
- [ ] Usage Documentation in `docs/usage.md` is updated. - [ ] Output Documentation in `docs/output.md` is updated. - [ ] `CHANGELOG.md` is updated. diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml index bdf169ee..7359661e 100644 --- a/.github/workflows/awsfulltest.yml +++ b/.github/workflows/awsfulltest.yml @@ -14,20 +14,26 @@ jobs: runs-on: ubuntu-latest steps: - name: Launch workflow via tower - uses: nf-core/tower-action@v3 + uses: seqeralabs/action-tower-launch@v2 + # TODO nf-core: You can customise AWS full pipeline tests as required # Add full size test data (but still relatively small datasets for few samples) # on the `test_full.config` test runs with only one set of parameters with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} + revision: ${{ github.sha }} workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/circdna/work-${{ github.sha }} parameters: | { + "hook_url": "${{ secrets.MEGATESTS_ALERTS_SLACK_HOOK_URL }}", "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/circdna/results-${{ github.sha }}" } - profiles: test_full,aws_tower - - uses: actions/upload-artifact@v3 + profiles: test_full + + - uses: actions/upload-artifact@v4 with: name: Tower debug log file - path: tower_action_*.log + path: | + tower_action_*.log + tower_action_*.json diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml index 3cae2f04..0225b8ca 100644 --- a/.github/workflows/awstest.yml +++ b/.github/workflows/awstest.yml @@ -12,18 +12,22 @@ jobs: steps: # Launch workflow using Tower CLI tool action - name: Launch workflow via tower - uses: seqeralabs/action-tower-launch@v1 + uses: seqeralabs/action-tower-launch@v2 with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} + revision: ${{ github.sha }} workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/circdna/work-${{ github.sha }} parameters: | { "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/circdna/results-test-${{ github.sha }}" } - profiles: test,aws_tower - - uses: actions/upload-artifact@v3 + profiles: test + + - uses: actions/upload-artifact@v4 with: name: Tower debug log file - path: tower_action_*.log + path: | + tower_action_*.log + tower_action_*.json diff --git a/.github/workflows/branch.yml b/.github/workflows/branch.yml index e0ab8c77..8095337a 100644 --- a/.github/workflows/branch.yml +++ b/.github/workflows/branch.yml @@ -19,7 +19,7 @@ jobs: # NOTE - this doesn't currently work if the PR is coming from a fork, due to limitations in GitHub actions secrets - name: Post PR comment if: failure() - uses: mshick/add-pr-comment@v1 + uses: mshick/add-pr-comment@v2 with: message: | ## This PR is against the `master` branch :x: diff --git a/.github/workflows/build-docker-image.yml b/.github/workflows/build-docker-image.yml index f9206fd2..7d24ad2c 100644 --- a/.github/workflows/build-docker-image.yml +++ b/.github/workflows/build-docker-image.yml @@ -24,4 +24,4 @@ jobs: with: file: modules/local/ampliconsuite/Dockerfile push: true - tags: "quay.io/nf-core/prepareaa:latest" + tags: "quay.io/nf-core/prepareaa:1.0.5" diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c5425869..a42749e9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,11 +24,11 @@ jobs: strategy: matrix: NXF_VER: - - "22.10.1" + - "23.04.0" - "latest-everything" steps: - name: Check out pipeline code - uses: 
actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Nextflow uses: nf-core/setup-nextflow@v1 @@ -36,75 +36,27 @@ jobs: version: "${{ matrix.NXF_VER }}" - name: Run pipeline with test data - # For example: adding multiple test runs with different parameters - # Remember that you can parallelise this by using strategy.matrix run: | nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results - - test_keep_duplicates: - name: Run pipeline with test data, but remove marked duplicates + test_AA: + name: Run pipeline with test data and ensure AmpliconSuite Installation # Only run on push if this is the nf-core dev branch (merged PRs) if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/circdna') }}" runs-on: ubuntu-latest strategy: matrix: NXF_VER: - - "22.10.1" + - "23.04.0" - "latest-everything" steps: - name: Check out pipeline code - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Install Nextflow uses: nf-core/setup-nextflow@v1 with: version: "${{ matrix.NXF_VER }}" - - name: Run pipeline with test data, but remove marked duplicates - # For example: adding multiple test runs with different parameters - # Remember that you can parallelise this by using strategy.matrix - run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results --keep_duplicates false - - test_skip_markduplicates: - name: Run pipeline with test data, but remove marked duplicates - # Only run on push if this is the nf-core dev branch (merged PRs) - if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/circdna') }}" - runs-on: ubuntu-latest - strategy: - matrix: - NXF_VER: - - "22.10.1" - - "latest-everything" - steps: - - name: Check out pipeline code - uses: actions/checkout@v2 - - name: Install Nextflow - uses: nf-core/setup-nextflow@v1 - with: - version: "${{ matrix.NXF_VER }}" - - name: Run pipeline with test data, but remove marked duplicates - # For example: adding multiple test runs with different parameters - # Remember that you can parallelise this by using strategy.matrix - run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results --skip_markduplicates - - ampliconarchitect: - if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/circdna') }}" - runs-on: ubuntu-latest - name: - Run pipeline with test_AA to test functionality of AmpliconArchitect - # For example: adding multiple test runs with different parameters - # Remember that you can parallelise this by using strategy.matrix - steps: - - name: Check out pipeline code - uses: actions/checkout@v2 - - - name: Install Nextflow - run: | - wget -qO- get.nextflow.io | bash - sudo mv nextflow /usr/local/bin/ - - - name: Run pipeline with AmpliconArchitect + - name: Run pipeline with test data run: | nextflow run ${GITHUB_WORKSPACE} -profile test_AA,docker --outdir ./results diff --git a/.github/workflows/clean-up.yml b/.github/workflows/clean-up.yml index 694e90ec..e37cfda5 100644 --- a/.github/workflows/clean-up.yml +++ b/.github/workflows/clean-up.yml @@ -10,7 +10,7 @@ jobs: issues: write pull-requests: write steps: - - uses: actions/stale@v7 + - uses: actions/stale@v9 with: stale-issue-message: "This issue has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor. Remove stale label or add a comment otherwise this issue will be closed in 20 days." 
stale-pr-message: "This PR has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor. Remove stale label or add a comment if it is still useful." diff --git a/.github/workflows/download_pipeline.yml b/.github/workflows/download_pipeline.yml new file mode 100644 index 00000000..8a330045 --- /dev/null +++ b/.github/workflows/download_pipeline.yml @@ -0,0 +1,67 @@ +name: Test successful pipeline download with 'nf-core download' + +# Run the workflow when: +# - dispatched manually +# - when a PR is opened or reopened to master branch +# - the head branch of the pull request is updated, i.e. if fixes for a release are pushed last minute to dev. +on: + workflow_dispatch: + pull_request: + types: + - opened + branches: + - master + pull_request_target: + branches: + - master + +env: + NXF_ANSI_LOG: false + +jobs: + download: + runs-on: ubuntu-latest + steps: + - name: Install Nextflow + uses: nf-core/setup-nextflow@v1 + + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + architecture: "x64" + - uses: eWaterCycle/setup-singularity@v7 + with: + singularity-version: 3.8.3 + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install git+https://github.com/nf-core/tools.git@dev + + - name: Get the repository name and current branch set as environment variable + run: | + echo "REPO_LOWERCASE=${GITHUB_REPOSITORY,,}" >> ${GITHUB_ENV} + echo "REPOTITLE_LOWERCASE=$(basename ${GITHUB_REPOSITORY,,})" >> ${GITHUB_ENV} + echo "REPO_BRANCH=${GITHUB_REF#refs/heads/}" >> ${GITHUB_ENV} + + - name: Download the pipeline + env: + NXF_SINGULARITY_CACHEDIR: ./ + run: | + nf-core download ${{ env.REPO_LOWERCASE }} \ + --revision ${{ env.REPO_BRANCH }} \ + --outdir ./${{ env.REPOTITLE_LOWERCASE }} \ + --compress "none" \ + --container-system 'singularity' \ + --container-library "quay.io" -l "docker.io" -l "ghcr.io" \ + --container-cache-utilisation 'amend' \ + --download-configuration + + - name: Inspect download + run: tree ./${{ env.REPOTITLE_LOWERCASE }} + + - name: Run the downloaded pipeline + env: + NXF_SINGULARITY_CACHEDIR: ./ + NXF_SINGULARITY_HOME_MOUNT: true + run: nextflow run ./${{ env.REPOTITLE_LOWERCASE }}/$( sed 's/\W/_/g' <<< ${{ env.REPO_BRANCH }}) -stub -profile test,singularity --outdir ./results diff --git a/.github/workflows/fix-linting.yml b/.github/workflows/fix-linting.yml index 8de169a4..0bf5f8a7 100644 --- a/.github/workflows/fix-linting.yml +++ b/.github/workflows/fix-linting.yml @@ -4,7 +4,7 @@ on: types: [created] jobs: - deploy: + fix-linting: # Only run if comment is on a PR with the main repo, and if it contains the magic keywords if: > contains(github.event.comment.html_url, '/pull/') && @@ -13,10 +13,17 @@ jobs: runs-on: ubuntu-latest steps: # Use the @nf-core-bot token to check out so we can push later - - uses: actions/checkout@v3 + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4 with: token: ${{ secrets.nf_core_bot_auth_token }} + # indication that the linting is being fixed + - name: React on comment + uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4 + with: + comment-id: ${{ github.event.comment.id }} + reactions: eyes + # Action runs on the issue comment, so we don't get the PR by default # Use the gh cli to check out the PR - name: Checkout Pull Request @@ -24,32 +31,59 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.nf_core_bot_auth_token }} - - uses: actions/setup-node@v3 + # Install and run pre-commit + - uses: 
actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5 + with: + python-version: 3.11 - - name: Install Prettier - run: npm install -g prettier @prettier/plugin-php + - name: Install pre-commit + run: pip install pre-commit - # Check that we actually need to fix something - - name: Run 'prettier --check' - id: prettier_status - run: | - if prettier --check ${GITHUB_WORKSPACE}; then - echo "result=pass" >> $GITHUB_OUTPUT - else - echo "result=fail" >> $GITHUB_OUTPUT - fi + - name: Run pre-commit + id: pre-commit + run: pre-commit run --all-files + continue-on-error: true - - name: Run 'prettier --write' - if: steps.prettier_status.outputs.result == 'fail' - run: prettier --write ${GITHUB_WORKSPACE} + # indication that the linting has finished + - name: react if linting finished succesfully + if: steps.pre-commit.outcome == 'success' + uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4 + with: + comment-id: ${{ github.event.comment.id }} + reactions: "+1" - name: Commit & push changes - if: steps.prettier_status.outputs.result == 'fail' + id: commit-and-push + if: steps.pre-commit.outcome == 'failure' run: | git config user.email "core@nf-co.re" git config user.name "nf-core-bot" git config push.default upstream git add . git status - git commit -m "[automated] Fix linting with Prettier" + git commit -m "[automated] Fix code linting" git push + + - name: react if linting errors were fixed + id: react-if-fixed + if: steps.commit-and-push.outcome == 'success' + uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4 + with: + comment-id: ${{ github.event.comment.id }} + reactions: hooray + + - name: react if linting errors were not fixed + if: steps.commit-and-push.outcome == 'failure' + uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4 + with: + comment-id: ${{ github.event.comment.id }} + reactions: confused + + - name: react if linting errors were not fixed + if: steps.commit-and-push.outcome == 'failure' + uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4 + with: + issue-number: ${{ github.event.issue.number }} + body: | + @${{ github.actor }} I tried to fix the linting errors, but it didn't work. Please fix them manually. + See [CI log](https://github.com/nf-core/circdna/actions/runs/${{ github.run_id }}) for more details. 
diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 888cb4bc..81cd098e 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -11,74 +11,35 @@ on: types: [published] jobs: - EditorConfig: + pre-commit: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - - uses: actions/setup-node@v3 - - - name: Install editorconfig-checker - run: npm install -g editorconfig-checker - - - name: Run ECLint check - run: editorconfig-checker -exclude README.md $(find .* -type f | grep -v '.git\|.py\|.md\|json\|yml\|yaml\|html\|css\|work\|.nextflow\|build\|nf_core.egg-info\|log.txt\|Makefile') - - Prettier: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - - uses: actions/setup-node@v3 - - - name: Install Prettier - run: npm install -g prettier - - - name: Run Prettier --check - run: prettier --check ${GITHUB_WORKSPACE} - - PythonBlack: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - - name: Check code lints with Black - uses: psf/black@stable - - # If the above check failed, post a comment on the PR explaining the failure - - name: Post PR comment - if: failure() - uses: mshick/add-pr-comment@v1 + - name: Set up Python 3.11 + uses: actions/setup-python@v5 with: - message: | - ## Python linting (`black`) is failing - - To keep the code consistent with lots of contributors, we run automated code consistency checks. - To fix this CI test, please run: - - * Install [`black`](https://black.readthedocs.io/en/stable/): `pip install black` - * Fix formatting errors in your pipeline: `black .` - - Once you push these changes the test should pass, and you can hide this comment :+1: + python-version: 3.11 + cache: "pip" - We highly recommend setting up Black in your code editor so that this formatting is done automatically on save. Ask about it on Slack for help! + - name: Install pre-commit + run: pip install pre-commit - Thanks again for your contribution! 
- repo-token: ${{ secrets.GITHUB_TOKEN }} - allow-repeats: false + - name: Run pre-commit + run: pre-commit run --all-files nf-core: runs-on: ubuntu-latest steps: - name: Check out pipeline code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Nextflow uses: nf-core/setup-nextflow@v1 - - uses: actions/setup-python@v4 + - uses: actions/setup-python@v5 with: - python-version: "3.8" + python-version: "3.11" architecture: "x64" - name: Install dependencies @@ -99,7 +60,7 @@ jobs: - name: Upload linting log file artifact if: ${{ always() }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: linting-logs path: | diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml index 0bbcd30f..147bcd10 100644 --- a/.github/workflows/linting_comment.yml +++ b/.github/workflows/linting_comment.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Download lint results - uses: dawidd6/action-download-artifact@v2 + uses: dawidd6/action-download-artifact@v3 with: workflow: linting.yml workflow_conclusion: completed diff --git a/.github/workflows/release-announcements.yml b/.github/workflows/release-announcements.yml new file mode 100644 index 00000000..21ac3f06 --- /dev/null +++ b/.github/workflows/release-announcements.yml @@ -0,0 +1,68 @@ +name: release-announcements +# Automatic release toot and tweet anouncements +on: + release: + types: [published] + workflow_dispatch: + +jobs: + toot: + runs-on: ubuntu-latest + steps: + - uses: rzr/fediverse-action@master + with: + access-token: ${{ secrets.MASTODON_ACCESS_TOKEN }} + host: "mstdn.science" # custom host if not "mastodon.social" (default) + # GitHub event payload + # https://docs.github.com/en/developers/webhooks-and-events/webhooks/webhook-events-and-payloads#release + message: | + Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}! + + Please see the changelog: ${{ github.event.release.html_url }} + + send-tweet: + runs-on: ubuntu-latest + + steps: + - uses: actions/setup-python@v5 + with: + python-version: "3.10" + - name: Install dependencies + run: pip install tweepy==4.14.0 + - name: Send tweet + shell: python + run: | + import os + import tweepy + + client = tweepy.Client( + access_token=os.getenv("TWITTER_ACCESS_TOKEN"), + access_token_secret=os.getenv("TWITTER_ACCESS_TOKEN_SECRET"), + consumer_key=os.getenv("TWITTER_CONSUMER_KEY"), + consumer_secret=os.getenv("TWITTER_CONSUMER_SECRET"), + ) + tweet = os.getenv("TWEET") + client.create_tweet(text=tweet) + env: + TWEET: | + Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}! + + Please see the changelog: ${{ github.event.release.html_url }} + TWITTER_CONSUMER_KEY: ${{ secrets.TWITTER_CONSUMER_KEY }} + TWITTER_CONSUMER_SECRET: ${{ secrets.TWITTER_CONSUMER_SECRET }} + TWITTER_ACCESS_TOKEN: ${{ secrets.TWITTER_ACCESS_TOKEN }} + TWITTER_ACCESS_TOKEN_SECRET: ${{ secrets.TWITTER_ACCESS_TOKEN_SECRET }} + + bsky-post: + runs-on: ubuntu-latest + steps: + - uses: zentered/bluesky-post-action@v0.1.0 + with: + post: | + Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}! 
+ + Please see the changelog: ${{ github.event.release.html_url }} + env: + BSKY_IDENTIFIER: ${{ secrets.BSKY_IDENTIFIER }} + BSKY_PASSWORD: ${{ secrets.BSKY_PASSWORD }} + # diff --git a/.gitignore b/.gitignore index dbc46ce6..fcb4940e 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ testing/ testing* *.pyc null +.vscode diff --git a/.gitpod.yml b/.gitpod.yml index 85d95ecc..acf72695 100644 --- a/.gitpod.yml +++ b/.gitpod.yml @@ -1,5 +1,12 @@ image: nfcore/gitpod:latest - +tasks: + - name: Update Nextflow and setup pre-commit + command: | + pre-commit install --install-hooks + nextflow self-update + - name: unset JAVA_TOOL_OPTIONS + command: | + unset JAVA_TOOL_OPTIONS vscode: extensions: # based on nf-core.nf-core-extensionpack - codezombiech.gitignore # Language support for .gitignore files diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0c31cdb9..af57081f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,5 +1,10 @@ repos: - repo: https://github.com/pre-commit/mirrors-prettier - rev: "v2.7.1" + rev: "v3.1.0" hooks: - id: prettier + - repo: https://github.com/editorconfig-checker/editorconfig-checker.python + rev: "2.7.3" + hooks: + - id: editorconfig-checker + alias: ec diff --git a/CHANGELOG.md b/CHANGELOG.md index fcbcdbdf..41557498 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,36 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## v1.1.0 - [2024-02-03] + +### Credits + +Special thanks to the following for their input and contributions to the release: + +- [Jens Luebeck](https://github.com/jluebeck) +- [Simon Pearce](https://github.com/SPPearce) +- [Maxime U Garcia](https://github.com/maxulysse) +- [Alex M. Ascensión](https://github.com/alexmascension) + +### Enhancements & fixes + +- Nf-core template update to 2.11.1 + - update of nf-core modules versions +- Removed AmpliconArchitect and AmpliconClassifier modules with their respective scripts in /bin + - AmpliconArchitect and AmpliconClassifier is now run inside the AmpliconSuite-Pipeline. Additional scripts are not necessary. + - Removed respective configs and workflow code +- Added AmpliconSuite-Pipeline + - A wrapper for calling copy numbers, preparing amplified intervals, running AmpliconArchitect, and calling amplicon classes using AmpliconClassifier + - Added docker container named [PrepareAA](https://quay.iorepository/nf-core/prepareaa?tab=tags) to run AmpliconSuite-Pipeline with singualarity or docker + - Added module configs and description +- Changed `assets/multiqc_config.yml`to fit new pipeline version +- Included directory checks for `mosek_license_dir` and `aa_data_repo` . + - Removed both directory parameters in the test profile as it is only checked when running `ampliconarchitect` +- Updated `nextflow_schema.json` to give better details about how to use `--circle_identifier` +- made `--circle_identifier` an essential parameter +- made `--input_format` an essential parameter and removed the default value to request specification by user +- Updated `--bwa_index` to accept only directory paths to the bwa index files. Makes the user input easier to not need to deal with file endings and patterns. Bug identified by [Alex M. 
Ascensión](https://github.com/alexmascension) in + ## v1.0.4 - [2023-06-26] ### `Added` diff --git a/CITATIONS.md b/CITATIONS.md index da0b48b9..42c5051c 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -12,6 +12,8 @@ - [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) + > Andrews, S. (2010). FastQC: A Quality Control Tool for High Throughput Sequence Data [Online]. + - [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. @@ -40,7 +42,7 @@ > Talevich E, Shain AH, Botton T, Bastian BC. Cnvkit: genome-wide copy number detection and visualization from targeted dna sequencing. PLoS Comput Biol. 2016;12(4):e1004873. doi: 10.1371/journal.pcbi.1004873. PMID: 27100738; PMCID: PMC4839673. -- [PrepareAA](https://github.com/jluebeck/AmpliconSuite-pipeline) +- [AmpliconSuite-Pipeline](https://github.com/AmpliconSuite/AmpliconSuite-pipeline) - [AmpliconArchitect](https://github.com/virajbdeshpande/AmpliconArchitect) @@ -48,7 +50,7 @@ - [AmpliconClassifier](https://github.com/jluebeck/AmpliconClassifier) - > Luebeck, Jens, Alvin Wei Tian Ng, Patricia C. Galipeau, Xiaohong Li, Carissa A. Sanchez, Annalise Katz-Summercorn, Hoon Kim et al. "Extrachromosomal DNA in the cancerous transformation of Barrett's esophagus." bioRxiv (2022): 2022-07. + > Luebeck J, Ng AWT, Galipeau PC, Li X, Sanchez CA, Katz-Summercorn AC, Kim H, Jammula S, He Y, Lippman SM, Verhaak RGW, Maley CC, Alexandrov LB, Reid BJ, Fitzgerald RC, Paulson TG, Chang HY, Wu S, Bafna V, Mischel PS. Extrachromosomal DNA in the cancerous transformation of Barrett's oesophagus. Nature. 2023 Apr;616(7958):798-805. doi: 10.1038/s41586-023-05937-5. Epub 2023 Apr 12. PMID: 37046089; PMCID: PMC10132967. - [Samblaster](https://github.com/GregoryFaust/samblaster) @@ -77,5 +79,8 @@ - [Docker](https://dl.acm.org/doi/10.5555/2600239.2600241) + > Merkel, D. (2014). Docker: lightweight linux containers for consistent development and deployment. Linux Journal, 2014(239), 2. doi: 10.5555/2600239.2600241. + - [Singularity](https://pubmed.ncbi.nlm.nih.gov/28494014/) + > Kurtzer GM, Sochat V, Bauer MW. Singularity: Scientific containers for mobility of compute. PLoS One. 2017 May 11;12(5):e0177459. doi: 10.1371/journal.pone.0177459. eCollection 2017. PubMed PMID: 28494014; PubMed Central PMCID: PMC5426675. 
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index f4fd052f..c089ec78 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -1,18 +1,20 @@ -# Code of Conduct at nf-core (v1.0) +# Code of Conduct at nf-core (v1.4) ## Our Pledge -In the interest of fostering an open, collaborative, and welcoming environment, we as contributors and maintainers of nf-core, pledge to making participation in our projects and community a harassment-free experience for everyone, regardless of: +In the interest of fostering an open, collaborative, and welcoming environment, we as contributors and maintainers of nf-core pledge to making participation in our projects and community a harassment-free experience for everyone, regardless of: - Age +- Ability - Body size +- Caste - Familial status - Gender identity and expression - Geographical location - Level of experience - Nationality and national origins - Native language -- Physical and neurological ability +- Neurodiversity - Race or ethnicity - Religion - Sexual identity and orientation @@ -22,80 +24,133 @@ Please note that the list above is alphabetised and is therefore not ranked in a ## Preamble -> Note: This Code of Conduct (CoC) has been drafted by the nf-core Safety Officer and been edited after input from members of the nf-core team and others. "We", in this document, refers to the Safety Officer and members of the nf-core core team, both of whom are deemed to be members of the nf-core community and are therefore required to abide by this Code of Conduct. This document will amended periodically to keep it up-to-date, and in case of any dispute, the most current version will apply. +:::note +This Code of Conduct (CoC) has been drafted by Renuka Kudva, Cris Tuñí, and Michael Heuer, with input from the nf-core Core Team and Susanna Marquez from the nf-core community. "We", in this document, refers to the Safety Officers and members of the nf-core Core Team, both of whom are deemed to be members of the nf-core community and are therefore required to abide by this Code of Conduct. This document will be amended periodically to keep it up-to-date. In case of any dispute, the most current version will apply. +::: -An up-to-date list of members of the nf-core core team can be found [here](https://nf-co.re/about). Our current safety officer is Renuka Kudva. +An up-to-date list of members of the nf-core core team can be found [here](https://nf-co.re/about). + +Our Safety Officers are Saba Nafees, Cris Tuñí, and Michael Heuer. nf-core is a young and growing community that welcomes contributions from anyone with a shared vision for [Open Science Policies](https://www.fosteropenscience.eu/taxonomy/term/8). Open science policies encompass inclusive behaviours and we strive to build and maintain a safe and inclusive environment for all individuals. -We have therefore adopted this code of conduct (CoC), which we require all members of our community and attendees in nf-core events to adhere to in all our workspaces at all times. Workspaces include but are not limited to Slack, meetings on Zoom, Jitsi, YouTube live etc. +We have therefore adopted this CoC, which we require all members of our community and attendees of nf-core events to adhere to in all our workspaces at all times. Workspaces include, but are not limited to, Slack, meetings on Zoom, gather.town, YouTube live etc. -Our CoC will be strictly enforced and the nf-core team reserve the right to exclude participants who do not comply with our guidelines from our workspaces and future nf-core activities. 
+Our CoC will be strictly enforced and the nf-core team reserves the right to exclude participants who do not comply with our guidelines from our workspaces and future nf-core activities. -We ask all members of our community to help maintain a supportive and productive workspace and to avoid behaviours that can make individuals feel unsafe or unwelcome. Please help us maintain and uphold this CoC. +We ask all members of our community to help maintain supportive and productive workspaces and to avoid behaviours that can make individuals feel unsafe or unwelcome. Please help us maintain and uphold this CoC. -Questions, concerns or ideas on what we can include? Contact safety [at] nf-co [dot] re +Questions, concerns, or ideas on what we can include? Contact members of the Safety Team on Slack or email safety [at] nf-co [dot] re. ## Our Responsibilities -The safety officer is responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behaviour. +Members of the Safety Team (the Safety Officers) are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behaviour. -The safety officer in consultation with the nf-core core team have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. +The Safety Team, in consultation with the nf-core core team, have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this CoC, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. -Members of the core team or the safety officer who violate the CoC will be required to recuse themselves pending investigation. They will not have access to any reports of the violations and be subject to the same actions as others in violation of the CoC. +Members of the core team or the Safety Team who violate the CoC will be required to recuse themselves pending investigation. They will not have access to any reports of the violations and will be subject to the same actions as others in violation of the CoC. -## When are where does this Code of Conduct apply? +## When and where does this Code of Conduct apply? -Participation in the nf-core community is contingent on following these guidelines in all our workspaces and events. This includes but is not limited to the following listed alphabetically and therefore in no order of preference: +Participation in the nf-core community is contingent on following these guidelines in all our workspaces and events, such as hackathons, workshops, bytesize, and collaborative workspaces on gather.town. These guidelines include, but are not limited to, the following (listed alphabetically and therefore in no order of preference): - Communicating with an official project email address. - Communicating with community members within the nf-core Slack channel. - Participating in hackathons organised by nf-core (both online and in-person events). 
-- Participating in collaborative work on GitHub, Google Suite, community calls, mentorship meetings, email correspondence. -- Participating in workshops, training, and seminar series organised by nf-core (both online and in-person events). This applies to events hosted on web-based platforms such as Zoom, Jitsi, YouTube live etc. +- Participating in collaborative work on GitHub, Google Suite, community calls, mentorship meetings, email correspondence, and on the nf-core gather.town workspace. +- Participating in workshops, training, and seminar series organised by nf-core (both online and in-person events). This applies to events hosted on web-based platforms such as Zoom, gather.town, Jitsi, YouTube live etc. - Representing nf-core on social media. This includes both official and personal accounts. ## nf-core cares 😊 -nf-core's CoC and expectations of respectful behaviours for all participants (including organisers and the nf-core team) include but are not limited to the following (listed in alphabetical order): +nf-core's CoC and expectations of respectful behaviours for all participants (including organisers and the nf-core team) include, but are not limited to, the following (listed in alphabetical order): - Ask for consent before sharing another community member’s personal information (including photographs) on social media. - Be respectful of differing viewpoints and experiences. We are all here to learn from one another and a difference in opinion can present a good learning opportunity. -- Celebrate your accomplishments at events! (Get creative with your use of emojis 🎉 🥳 💯 🙌 !) +- Celebrate your accomplishments! (Get creative with your use of emojis 🎉 🥳 💯 🙌 !) - Demonstrate empathy towards other community members. (We don’t all have the same amount of time to dedicate to nf-core. If tasks are pending, don’t hesitate to gently remind members of your team. If you are leading a task, ask for help if you feel overwhelmed.) - Engage with and enquire after others. (This is especially important given the geographically remote nature of the nf-core community, so let’s do this the best we can) - Focus on what is best for the team and the community. (When in doubt, ask) -- Graciously accept constructive criticism, yet be unafraid to question, deliberate, and learn. +- Accept feedback, yet be unafraid to question, deliberate, and learn. - Introduce yourself to members of the community. (We’ve all been outsiders and we know that talking to strangers can be hard for some, but remember we’re interested in getting to know you and your visions for open science!) -- Show appreciation and **provide clear feedback**. (This is especially important because we don’t see each other in person and it can be harder to interpret subtleties. Also remember that not everyone understands a certain language to the same extent as you do, so **be clear in your communications to be kind.**) +- Show appreciation and **provide clear feedback**. (This is especially important because we don’t see each other in person and it can be harder to interpret subtleties. Also remember that not everyone understands a certain language to the same extent as you do, so **be clear in your communication to be kind.**) - Take breaks when you feel like you need them. -- Using welcoming and inclusive language. (Participants are encouraged to display their chosen pronouns on Zoom or in communication on Slack.) +- Use welcoming and inclusive language. 
(Participants are encouraged to display their chosen pronouns on Zoom or in communication on Slack) ## nf-core frowns on 😕 -The following behaviours from any participants within the nf-core community (including the organisers) will be considered unacceptable under this code of conduct. Engaging or advocating for any of the following could result in expulsion from nf-core workspaces. +The following behaviours from any participants within the nf-core community (including the organisers) will be considered unacceptable under this CoC. Engaging or advocating for any of the following could result in expulsion from nf-core workspaces: - Deliberate intimidation, stalking or following and sustained disruption of communication among participants of the community. This includes hijacking shared screens through actions such as using the annotate tool in conferencing software such as Zoom. - “Doxing” i.e. posting (or threatening to post) another person’s personal identifying information online. - Spamming or trolling of individuals on social media. -- Use of sexual or discriminatory imagery, comments, or jokes and unwelcome sexual attention. -- Verbal and text comments that reinforce social structures of domination related to gender, gender identity and expression, sexual orientation, ability, physical appearance, body size, race, age, religion or work experience. +- Use of sexual or discriminatory imagery, comments, jokes, or unwelcome sexual attention. +- Verbal and text comments that reinforce social structures of domination related to gender, gender identity and expression, sexual orientation, ability, physical appearance, body size, race, age, religion, or work experience. ### Online Trolling -The majority of nf-core interactions and events are held online. Unfortunately, holding events online comes with the added issue of online trolling. This is unacceptable, reports of such behaviour will be taken very seriously, and perpetrators will be excluded from activities immediately. +The majority of nf-core interactions and events are held online. Unfortunately, holding events online comes with the risk of online trolling. This is unacceptable — reports of such behaviour will be taken very seriously and perpetrators will be excluded from activities immediately. -All community members are required to ask members of the group they are working within for explicit consent prior to taking screenshots of individuals during video calls. +All community members are **required** to ask members of the group they are working with for explicit consent prior to taking screenshots of individuals during video calls. -## Procedures for Reporting CoC violations +## Procedures for reporting CoC violations If someone makes you feel uncomfortable through their behaviours or actions, report it as soon as possible. -You can reach out to members of the [nf-core core team](https://nf-co.re/about) and they will forward your concerns to the safety officer(s). +You can reach out to members of the Safety Team (Saba Nafees, Cris Tuñí, and Michael Heuer) on Slack. Alternatively, contact a member of the nf-core core team [nf-core core team](https://nf-co.re/about), and they will forward your concerns to the Safety Team. + +Issues directly concerning members of the Core Team or the Safety Team will be dealt with by other members of the core team and the safety manager — possible conflicts of interest will be taken into account. nf-core is also in discussions about having an ombudsperson and details will be shared in due course. 
+ +All reports will be handled with the utmost discretion and confidentiality. + +You can also report any CoC violations to safety [at] nf-co [dot] re. In your email report, please do your best to include: + +- Your contact information. +- Identifying information (e.g. names, nicknames, pseudonyms) of the participant who has violated the Code of Conduct. +- The behaviour that was in violation and the circumstances surrounding the incident. +- The approximate time of the behaviour (if different than the time the report was made). +- Other people involved in the incident, if applicable. +- If you believe the incident is ongoing. +- If there is a publicly available record (e.g. mailing list record, a screenshot). +- Any additional information. + +After you file a report, one or more members of our Safety Team will contact you to follow up on your report. + +## Who will read and handle reports + +All reports will be read and handled by the members of the Safety Team at nf-core. + +If members of the Safety Team are deemed to have a conflict of interest with a report, they will be required to recuse themselves as per our Code of Conduct and will not have access to any follow-ups. + +To keep this first report confidential from any of the Safety Team members, please submit your first report by direct messaging on Slack/direct email to any of the nf-core members you are comfortable disclosing the information to, and be explicit about which member(s) you do not consent to sharing the information with. + +## Reviewing reports + +After receiving the report, members of the Safety Team will review the incident report to determine whether immediate action is required, for example, whether there is immediate threat to participants’ safety. + +The Safety Team, in consultation with members of the nf-core core team, will assess the information to determine whether the report constitutes a Code of Conduct violation, for them to decide on a course of action. + +In the case of insufficient information, one or more members of the Safety Team may contact the reporter, the reportee, or any other attendees to obtain more information. -Issues directly concerning members of the core team will be dealt with by other members of the core team and the safety manager, and possible conflicts of interest will be taken into account. nf-core is also in discussions about having an ombudsperson, and details will be shared in due course. +Once additional information is gathered, the Safety Team will collectively review and decide on the best course of action to take, if any. The Safety Team reserves the right to not act on a report. -All reports will be handled with utmost discretion and confidentially. +## Confidentiality + +All reports, and any additional information included, are only shared with the team of safety officers (and possibly members of the core team, in case the safety officer is in violation of the CoC). We will respect confidentiality requests for the purpose of protecting victims of abuse. + +We will not name harassment victims, beyond discussions between the safety officer and members of the nf-core team, without the explicit consent of the individuals involved. + +## Enforcement + +Actions taken by the nf-core’s Safety Team may include, but are not limited to: + +- Asking anyone to stop a behaviour. +- Asking anyone to leave the event and online spaces either temporarily, for the remainder of the event, or permanently. +- Removing access to the gather.town and Slack, either temporarily or permanently. 
+- Communicating to all participants to reinforce our expectations for conduct and remind what is unacceptable behaviour; this may be public for practical reasons. +- Communicating to all participants that an incident has taken place and how we will act or have acted — this may be for the purpose of letting event participants know we are aware of and dealing with the incident. +- Banning anyone from participating in nf-core-managed spaces, future events, and activities, either temporarily or permanently. +- No action. ## Attribution and Acknowledgements @@ -106,6 +161,22 @@ All reports will be handled with utmost discretion and confidentially. ## Changelog -### v1.0 - March 12th, 2021 +### v1.4 - February 8th, 2022 + +- Included a new member of the Safety Team. Corrected a typographical error in the text. + +### v1.3 - December 10th, 2021 + +- Added a statement that the CoC applies to nf-core gather.town workspaces. Corrected typographical errors in the text. + +### v1.2 - November 12th, 2021 + +- Removed information specific to reporting CoC violations at the Hackathon in October 2021. + +### v1.1 - October 14th, 2021 + +- Updated with names of new Safety Officers and specific information for the hackathon in October 2021. + +### v1.0 - March 15th, 2021 - Complete rewrite from original [Contributor Covenant](http://contributor-covenant.org/) CoC. diff --git a/README.md b/README.md index ff6a4203..28fbecbb 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,17 @@ -# ![nf-core/circdna](docs/images/nf-core-circdna_logo_light.png#gh-light-mode-only) ![nf-core/circdna](docs/images/nf-core-circdna_logo_dark.png#gh-dark-mode-only) - -[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/circdna/results) [![Cite with Zenodo](https://zenodo.org/badge/DOI/10.5281/zenodo.6685250.svg)](https://doi.org/10.5281/zenodo.6685250) - -[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A522.10.1-23aa62.svg)](https://www.nextflow.io/) +
+<h1>
+  <picture>
+    <source media="(prefers-color-scheme: dark)" srcset="docs/images/nf-core-circdna_logo_dark.png">
+    <img alt="nf-core/circdna" src="docs/images/nf-core-circdna_logo_light.png">
+  </picture>
+</h1>
+
+[![GitHub Actions CI Status](https://github.com/nf-core/circdna/workflows/nf-core%20CI/badge.svg)](https://github.com/nf-core/circdna/actions?query=workflow%3A%22nf-core+CI%22) +[![GitHub Actions Linting Status](https://github.com/nf-core/circdna/workflows/nf-core%20linting/badge.svg)](https://github.com/nf-core/circdna/actions?query=workflow%3A%22nf-core+linting%22)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/circdna/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.8085422?labelColor=000000)](https://doi.org/10.5281/zenodo.8085422) + +[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A523.04.0-23aa62.svg)](https://www.nextflow.io/) +[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) +[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) +[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) +[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A523.04.0-23aa62.svg)](https://www.nextflow.io/) [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?logo=anaconda)](https://docs.conda.io/en/latest/) [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?logo=docker)](https://www.docker.com/) [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg)](https://sylabs.io/docs/) @@ -44,10 +53,8 @@ A graphical view of the pipeline and its diverse branches can be seen below. ## Usage -> **Note** -> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how -> to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) -> with `-profile test` before running the workflow on actual data. +> [!NOTE] +> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data. First, prepare a samplesheet with your input data that looks as follows: @@ -72,7 +79,17 @@ Each row represents a pair of fastq files (paired end) or a single bam file gene Now, you can run the pipeline using: ```bash - nextflow run nf-core/circdna --input samplesheet.csv --outdir --genome GRCh38 -profile --circle_identifier + nextflow run nf-core/circdna --input samplesheet.csv --outdir --genome GRCh38 -profile --circle_identifier --input_format <"FASTQ"/"BAM"> +``` + +### Test AmpliconSuite-Pipeline with a test data-set + +To test the correct installation of the pipeline and the use of AmpliconArchitect inside the [AmpliconSuite-Pipeline](https://github.com/AmpliconSuite/AmpliconSuite-pipeline), a small WGS data set is uploaded to [AWS](https://aws.amazon.com/) and can be downloaded and used with the parameter `-profile test_AA_local`. You just need to specify your local paths to the `aa_data_repo` and the `mosek_license_dir`. See [AmpliconSuite-Pipeline](https://github.com/AmpliconSuite/AmpliconSuite-pipeline) for information about the data repository and the Mosek license. 
To note, the Mosek license file needs to be named `mosek.lic` inside the `mosek_license_dir`. + +You can test the pipeline using: + +```bash + nextflow run nf-core/circdna -profile test_AA_local, --outdir --aa_data_repo --mosek_license_dir ``` ## Available ecDNA identifiers @@ -85,22 +102,21 @@ Please specify the parameter `circle_identifier` depending on the pipeline branc ### Identification of amplified ecDNAs with WGS data -> `ampliconarchitect` uses [AmpliconArchitect](https://github.com/jluebeck/AmpliconArchitect) +> `ampliconarchitect` uses [AmpliconArchitect](https://github.com/jluebeck/AmpliconArchitect) inside the [AmpliconSuite-Pipeline](https://github.com/AmpliconSuite/AmpliconSuite-pipeline) ### De novo assembly of ecDNAs with Circle-seq data > `unicycler` uses [Unicycler](https://github.com/rrwick/Unicycler) for de novo assembly of ecDNAs and [Minimap2](https://github.com/lh3/minimap2) for accurate mapping of the identified circular sequences. -> **Warning:** -> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those -> provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; +> [!WARNING] +> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; > see [docs](https://nf-co.re/usage/configuration#custom-configuration-files). -For more details, please refer to the [usage documentation](https://nf-co.re/circdna/usage) and the [parameter documentation](https://nf-co.re/circdna/parameters). +For more details and further functionality, please refer to the [usage documentation](https://nf-co.re/circdna/usage) and the [parameter documentation](https://nf-co.re/circdna/parameters). ## Pipeline output -To see the the results of a test run with a full size dataset refer to the [results](https://nf-co.re/circdna/results) tab on the nf-core website pipeline page. +To see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/circdna/results) tab on the nf-core website pipeline page. For more details about the output files and reports, please refer to the [output documentation](https://nf-co.re/circdna/output). diff --git a/assets/email_template.html b/assets/email_template.html index 8d153ad4..de9928d1 100644 --- a/assets/email_template.html +++ b/assets/email_template.html @@ -12,7 +12,7 @@ -

-<h1>nf-core/circdna v${version}</h1>
+<h1>nf-core/circdna ${version}</h1>
 <h2>Run Name: $runName</h2>

<% if (!success){ diff --git a/assets/email_template.txt b/assets/email_template.txt index 19d70481..35c87aa7 100644 --- a/assets/email_template.txt +++ b/assets/email_template.txt @@ -4,7 +4,7 @@ |\\ | |__ __ / ` / \\ |__) |__ } { | \\| | \\__, \\__/ | \\ |___ \\`-._,-`-, `._,._,' - nf-core/circdna v${version} + nf-core/circdna ${version} ---------------------------------------------------- Run Name: $runName diff --git a/assets/methods_description_template.yml b/assets/methods_description_template.yml index 418eb9b5..ecac477c 100644 --- a/assets/methods_description_template.yml +++ b/assets/methods_description_template.yml @@ -3,17 +3,21 @@ description: "Suggested text and references to use when describing pipeline usag section_name: "nf-core/circdna Methods Description" section_href: "https://github.com/nf-core/circdna" plot_type: "html" -## TODO nf-core: Update the HTML below to your prefered methods description, e.g. add publication citation for this pipeline +## TODO nf-core: Update the HTML below to your preferred methods description, e.g. add publication citation for this pipeline ## You inject any metadata in the Nextflow '${workflow}' object data: |

   Methods
-  Data was processed using nf-core/circdna v${workflow.manifest.version} ${doi_text} of the nf-core collection of workflows (Ewels et al., 2020).
+  Data was processed using nf-core/circdna v${workflow.manifest.version} ${doi_text} of the nf-core collection of workflows (Ewels et al., 2020), utilising reproducible software environments from the Bioconda (Grüning et al., 2018) and Biocontainers (da Veiga Leprevost et al., 2017) projects.
   The pipeline was executed with Nextflow v${workflow.nextflow.version} (Di Tommaso et al., 2017) with the following command:
   ${workflow.commandLine}
+  ${tool_citations}
   References
-    Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., & Notredame, C. (2017). Nextflow enables reproducible computational workflows. Nature Biotechnology, 35(4), 316-319. https://doi.org/10.1038/nbt.3820
-    Ewels, P. A., Peltzer, A., Fillinger, S., Patel, H., Alneberg, J., Wilm, A., Garcia, M. U., Di Tommaso, P., & Nahnsen, S. (2020). The nf-core framework for community-curated bioinformatics pipelines. Nature Biotechnology, 38(3), 276-278. https://doi.org/10.1038/s41587-020-0439-x
+    Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., & Notredame, C. (2017). Nextflow enables reproducible computational workflows. Nature Biotechnology, 35(4), 316-319. doi: 10.1038/nbt.3820
+    Ewels, P. A., Peltzer, A., Fillinger, S., Patel, H., Alneberg, J., Wilm, A., Garcia, M. U., Di Tommaso, P., & Nahnsen, S. (2020). The nf-core framework for community-curated bioinformatics pipelines. Nature Biotechnology, 38(3), 276-278. doi: 10.1038/s41587-020-0439-x
+    Grüning, B., Dale, R., Sjödin, A., Chapman, B. A., Rowe, J., Tomkins-Tinch, C. H., Valieris, R., Köster, J., & Bioconda Team. (2018). Bioconda: sustainable and comprehensive software distribution for the life sciences. Nature Methods, 15(7), 475–476. doi: 10.1038/s41592-018-0046-7
+    da Veiga Leprevost, F., Grüning, B. A., Alves Aflitos, S., Röst, H. L., Uszkoreit, J., Barsnes, H., Vaudel, M., Moreno, P., Gatto, L., Weber, J., Bai, M., Jimenez, R. C., Sachsenberg, T., Pfeuffer, J., Vera Alvarez, R., Griss, J., Nesvizhskii, A. I., & Perez-Riverol, Y. (2017). BioContainers: an open-source and community-driven framework for software standardization. Bioinformatics (Oxford, England), 33(16), 2580–2582. doi: 10.1093/bioinformatics/btx192
+    ${tool_bibliography}
   Notes:
diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
index 08bed135..597bff6f 100644
--- a/assets/multiqc_config.yml
+++ b/assets/multiqc_config.yml
@@ -1,7 +1,7 @@
 report_comment: >
-  This report has been generated by the nf-core/circdna
+  This report has been generated by the nf-core/circdna
   analysis pipeline. For information about how to interpret these results, please see the
-  documentation.
+  documentation.
 report_section_order:
   "nf-core-circdna-methods-description":
     order: -1000
diff --git a/assets/nf-core-circdna_logo_light.png b/assets/nf-core-circdna_logo_light.png
index 7146e281..87a4925d 100644
Binary files a/assets/nf-core-circdna_logo_light.png and b/assets/nf-core-circdna_logo_light.png differ
diff --git a/assets/slackreport.json b/assets/slackreport.json
index 043d02f2..aa69f8e3 100644
--- a/assets/slackreport.json
+++ b/assets/slackreport.json
@@ -3,7 +3,7 @@
     {
       "fallback": "Plain-text summary of the attachment.",
       "color": "<% if (success) { %>good<% } else { %>danger<%} %>",
-      "author_name": "sanger-tol/readmapping v${version} - ${runName}",
+      "author_name": "nf-core/circdna ${version} - ${runName}",
       "author_icon": "https://www.nextflow.io/docs/latest/_static/favicon.ico",
       "text": "<% if (success) { %>Pipeline completed successfully!<% } else { %>Pipeline completed with errors<% } %>",
       "fields": [
diff --git a/bin/AmpliconArchitect.py b/bin/AmpliconArchitect.py
deleted file mode 100755
index 6846a21d..00000000
--- a/bin/AmpliconArchitect.py
+++ /dev/null
@@ -1,499 +0,0 @@
-#!/usr/bin/env python
-
-
-# This software is Copyright 2017 The Regents of the University of California. All Rights Reserved. Permission to copy, modify, and distribute this software and its documentation for educational, research and non-profit purposes, without fee, and without a written agreement is hereby granted, provided that the above copyright notice, this paragraph and the following three paragraphs appear in all copies. Permission to make commercial use of this software may be obtained by contacting:
-#
-# Office of Innovation and Commercialization
-#
-# University of California
-#
-# La Jolla, CA 92093-0910
-#
-# (858) 534-5815
-#
-# invent@ucsd.edu
-#
-# This software program and documentation are copyrighted by The Regents of the University of California. The software program and documentation are supplied "as is", without any accompanying services from The Regents. The Regents does not warrant that the operation of the program will be uninterrupted or error-free. The end-user understands that the program was developed for research purposes and is advised not to rely exclusively on the program for any reason.
-#
-# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
- - -# Author: Viraj Deshpande -# Contact: virajbdeshpande@gmail.com -# Maintained by Jens Luebeck, jluebeck@ucsd.edu -# Source: https://github.com/jluebeck/AmpliconArchitect -# Commit: 2172cdfd5b2834f98f60a5ee77f282249e16f527 - - -from time import time - -TSTART = time() -import numpy as np -import pysam -import argparse -import sys -import os -import matplotlib -import copy - -matplotlib.use("Agg") -import logging -from functools import reduce - - -if sys.version_info >= (3, 0): - from io import StringIO -else: - from cStringIO import StringIO - -import global_names - -__version__ = "1.3.r5" - -parser = argparse.ArgumentParser(description="Reconstruct Amplicons connected to listed intervals.") -parser.add_argument( - "--bed", - dest="rdAlts", - help="Bed file with putative list of amplified intervals", - metavar="FILE", - action="store", - type=str, - required=True, -) -parser.add_argument( - "--bam", - dest="bam", - help="Coordinate sorted BAM file with index.", - metavar="FILE", - action="store", - type=str, - required=True, -) -parser.add_argument( - "-o", - "--out", - dest="outName", - help="Prefix for output files", - metavar="FILE", - action="store", - type=str, - nargs=1, - required=True, -) -parser.add_argument( - "--runmode", - dest="runmode", - help="Values: [FULL/BPGRAPH/CYCLES/SVVIEW]. This option determines which stages of AA will be run. FULL: Run the full reconstruction including breakpoint graph, cycles as well as SV visualization. BPGRAPH: Only reconstruct the breakpoint graph and estimate copy counts, but do not reconstruct the amplicon cycles. CYCLES: Only reconstruct the breakpoint graph and cycles, but do not create the output for SV visualization. SVVIEW: Only create the SV visualization, but do not reconstruct the breakpoint graph or cycles", - metavar="STR", - action="store", - type=str, - default="FULL", -) -parser.add_argument( - "--extendmode", - dest="extendmode", - help="Values: [EXPLORE/CLUSTERED/UNCLUSTERED/VIRAL]. This determines how the input intervals in bed file are treated. EXPLORE : Search for all connected intervals in genome that may be connected to input intervals. CLUSTERED : Input intervals are treated as part of a single connected amplicon and no new connected intervals are added. UNCLUSTERED : Each input interval is treated as a distinct single interval amplicon and no new intervals are added.", - metavar="STR", - action="store", - type=str, - default="EXPLORE", -) -parser.add_argument( - "--sensitivems", - dest="sensitivems", - help='Values: [True, False]. Set "True" only if expected copy counts to vary by orders of magnitude, .e.g viral integration. Default: False', - metavar="STR", - action="store", - type=str, - default="False", -) -parser.add_argument( - "--plotstyle", - dest="plotstyle", - help='Values: [small large, all_amplicons]. "small": small font, "all_amplicons": display a large number of intervals in a single plot, recommeded for visualizing multiple amplicons in CLUSTERED mode. Default: "large"', - metavar="STR", - action="store", - type=str, - default="small", -) -parser.add_argument( - "--ref", - dest="ref", - help='Values: [hg19, GRCh37, GRCh38, GRCh38_viral, mm10, GRCm38]. "hg19", "GRCh38", "mm10" : chr1, .. chrM etc / "GRCh37", "GRCm38" : \'1\', \'2\', .. \'MT\' etc/ "None" : Do not use any annotations. 
AA can tolerate additional chromosomes not stated but accuracy and annotations may be affected.', - metavar="STR", - action="store", - type=str, - choices=["hg19", "GRCh37", "GRCh38", "GRCh38_viral", "mm10", "GRCm38"], - required=True, -) -parser.add_argument( - "--downsample", - dest="downsample", - help="Values: [-1, 0, C(>0)]. Decide how to downsample the bamfile during reconstruction. Reads are automatically downsampled in real time for speedup. Alternatively pre-process bam file using $AA_SRC/downsample.py. -1 : Do not downsample bam file, use full coverage. 0 (default): Downsample bamfile to 10X coverage if original coverage larger then 10. C (>0) : Downsample bam file to coverage C if original coverage larger than C", - metavar="FLOAT", - action="store", - type=float, - default=0, -) -parser.add_argument( - "--cbam", - dest="cbam", - help="Optional bamfile to use for coverage calculation", - metavar="FILE", - action="store", - type=str, - default=None, -) -parser.add_argument( - "--cbed", - dest="cbed", - help="Optional bedfile defining 1000 10kbp genomic windows for coverage calcualtion", - metavar="FILE", - action="store", - type=str, - default=None, -) -parser.add_argument( - "--insert_sdevs", - dest="insert_sdevs", - help="Number of standard deviations around the insert size. May need to increase for sequencing runs with high variance after insert size selection step. (default 3.0)", - metavar="FLOAT", - action="store", - type=float, - default=3, -) -parser.add_argument( - "--pair_support_min", - dest="pair_support_min", - help="Number of read pairs for minimum breakpoint support (default 2 but typically becomes higher due to coverage-scaled cutoffs)", - metavar="INT", - action="store", - type=int, - default=2, -) -parser.add_argument( - "--no_cstats", - dest="no_cstats", - help="Do not re-use coverage statistics from coverage.stats.", - action="store_true", - default=False, -) -parser.add_argument( - "--random_seed", - dest="random_seed", - help="Set flag to use the numpy default random seed (sets np.random.seed(seed=None)), otherwise will use seed=0", - action="store_true", - default=False, -) - -parser.add_argument( - "-v", "--version", action="version", version="AmpliconArchitect version {version} \n".format(version=__version__) -) - -args = parser.parse_args() -global_names.REF = args.ref -global_names.TSTART = TSTART -if args.random_seed: - global_names.SEED = None - - -logging.basicConfig(filename=args.outName[0] + ".log", level=logging.DEBUG) -logging.getLogger("fontTools.subset").level = logging.WARN - -# # output logs to stdout -root = logging.getLogger() -# root.setLevel(logging.DEBUG) -ch = logging.StreamHandler(sys.stdout) -ch.setLevel(logging.INFO) -formatter = logging.Formatter("[%(name)s:%(levelname)s]\t%(message)s") -ch.setFormatter(formatter) -root.addHandler(ch) -summary_logger = logging.getLogger("summary") -summary_logger.propagate = False -summary_logger.addHandler(logging.FileHandler(args.outName[0] + "_summary.txt", "w")) -graph_logger = logging.getLogger("graph") -graph_logger.propagate = False -cycle_logger = logging.getLogger("cycle") -cycle_logger.propagate = False - - -class PrefixAdapter(logging.LoggerAdapter): - def process(self, msg, kwargs): - return "[%s] %s" % (self.extra["prefix"], msg), kwargs - - -commandstring = "Commandline: " - -for arg in sys.argv: - if " " in arg: - commandstring += '"{}" '.format(arg) - else: - commandstring += "{} ".format(arg) - -logging.info(commandstring) - -logging.info("AmpliconArchitect version " + __version__ + 
"\n") -logging.info("Python version " + sys.version + "\n") -rdAlts = args.rdAlts -if os.path.splitext(args.bam)[-1] == ".cram": - bamFile = pysam.Samfile(args.bam, "rc") -else: - bamFile = pysam.Samfile(args.bam, "rb") -outName = args.outName[0] -cbam = None -if args.cbam is not None: - if os.path.splitext(args.cbam)[-1] == ".cram": - cbam = pysam.Samfile(args.cbam, "rc") - else: - cbam = pysam.Samfile(args.cbam, "rb") -cbed = args.cbed -try: - DATA_REPO = os.environ["AA_DATA_REPO"] -except: - logging.warning( - "#TIME " + "%.3f\t" % (time() - TSTART) + "unable to set AA_DATA_REPO variable. Setting to working directory" - ) - DATA_REPO = "." -if DATA_REPO == "." or DATA_REPO == "": - logging.warning( - "#TIME " + "%.3f\t" % (time() - TSTART) + "AA_DATA_REPO not set or empy. Setting to working directory" - ) - DATA_REPO = "." - - -logging.info("#TIME " + "%.3f\t" % (time() - TSTART) + "Loading libraries and reference annotations for: " + args.ref) -import ref_util as hg -import bam_to_breakpoint as b2b - -logging.info("#TIME " + "%.3f\t" % (time() - TSTART) + "Initiating bam_to_breakpoint object for: " + args.bam) -rdList0 = hg.interval_list(rdAlts, "bed", exclude_info_string=True) -rdList = hg.interval_list([r for r in rdList0]) -cb = bamFile -if cbam is not None: - cb = cbam - -cstats = None -if args.no_cstats: - logging.info( - "#TIME " + "%.3f\t" % (time() - TSTART) + "--no_cstats was set. Will not attempt to re-use coverage.stats info" - ) - -if os.path.exists(os.path.join(hg.DATA_REPO, "coverage.stats")) and not args.no_cstats: - coverage_stats_file = open(os.path.join(hg.DATA_REPO, "coverage.stats")) - for l in coverage_stats_file: - ll = l.strip().split() - if not ll: - continue - bamfile_pathname = str(cb.filename.decode()) - if ll[0] == os.path.abspath(bamfile_pathname): - bamfile_filesize = os.path.getsize(bamfile_pathname) - cstats = tuple(map(float, ll[1:])) - if len(cstats) < 15 or int(round(cstats[11])) < args.pair_support_min: - cstats = None - elif cstats[13] != args.insert_sdevs or bamfile_filesize != int(cstats[14]) or any(np.isnan(cstats)): - cstats = None - - coverage_stats_file.close() - -if cstats: - logging.info( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + "Reusing cstats from " - + str(os.path.join(hg.DATA_REPO, "coverage.stats")) - ) -else: - logging.debug("#TIME " + "%.3f\t" % (time() - TSTART) + "cstats not found, generating coverage statistics... 
") - - -coverage_windows = None -if cbed is not None: - coverage_windows = hg.interval_list(cbed, "bed") - coverage_windows.sort() -if cstats is None and cbam is not None: - cbam2b = b2b.bam_to_breakpoint( - cbam, - sample_name=outName, - num_sdevs=args.insert_sdevs, - pair_support_min=args.pair_support_min, - coverage_stats=cstats, - coverage_windows=coverage_windows, - ) - cstats = cbam2b.basic_stats -bamFileb2b = b2b.bam_to_breakpoint( - bamFile, - sample_name=outName, - num_sdevs=args.insert_sdevs, - pair_support_min=args.pair_support_min, - coverage_stats=cstats, - coverage_windows=coverage_windows, - downsample=args.downsample, - sensitivems=(args.sensitivems == "True"), - span_coverage=(args.cbam is None), - tstart=TSTART, -) - - -segments = [] -# segments=hg.interval_list(rdAlts.replace('.bed', '_segments.bed'), 'bed') - -# bandsfile="karyotype.HK359.EGFR.txt" -# segments = [(l[2], hg.interval(l[1], int(l[4]), int(l[5])).intersection(i), l[6]) for l in [ll.strip().split() for ll in open(bandsfile) if 'band' in ll and ll.strip().split()[1][:3] == 'chr'] if hg.interval(l[1], int(l[4]), int(l[5])).intersects(i)] -# segments = [('', hg.interval(l[1], int(l[4]), int(l[5])), l[6]) for l in [ll.strip().split() for ll in open(bandsfile) if 'band' in ll and ll.strip().split()[1][:3] == 'chr']] - - -if args.extendmode == "VIRAL": - logging.info("#TIME " + "%.3f\t" % (time() - TSTART) + "Finding integration sites: " + str(rdList[0])) - de = bamFileb2b.interval_discordant_edges(rdList) - old_stdout = sys.stdout - sys.stdout = mystdout = StringIO() - amplist = bamFileb2b.interval_hops(rdList, explore=False) - alist = hg.interval_list( - [hg.interval(e[0].v1.chrom, e[0].v1.pos, e[0].v1.pos) for e in de] - + [hg.interval(e[0].v2.chrom, e[0].v2.pos, e[0].v2.pos) for e in de] - + rdList - ) - alist.sort() - rdList = hg.interval_list( - [ - i[0] - for i in alist.merge_clusters(extend=5000000) - if len(hg.interval_list([i[0]]).intersection(amplist) + hg.interval_list([i[0]]).intersection(rdList)) > 0 - ] - ) - rdList = hg.interval_list( - [ - hg.interval(i.chrom, max(0, i.start - 10000), min(i.end + 10000, hg.chrLen[hg.chrNum(i.chrom)])) - for i in rdList - ] - ) - iout = open(outName + ".integration_search.out", "w") - iout.write(mystdout.getvalue()) - iout.close() - sys.stdout = old_stdout - -all_ilist = copy.copy(rdList) -irdhops = [] -irddict = {} -irdSets = set([frozenset([ird]) for ird in rdList]) -irdgroupdict = {ird: frozenset([ird]) for ird in rdList} -if args.extendmode == "EXPLORE" or args.extendmode == "VIRAL": - for ird in rdList: - logging.info("#TIME " + "%.3f\t" % (time() - TSTART) + "Exploring interval: " + str(ird)) - old_stdout = sys.stdout - sys.stdout = mystdout = StringIO() - ilist = bamFileb2b.interval_hops(ird, rdlist=all_ilist) - irdhops.append((ird, ilist)) - for i in ilist: - irddict[i] = ird - # iout = open(outName + '.' 
+ ird.chrom + ":" + str(ird.start) + '-' + str(ird.end) + '.out', 'w') - # iout.write(mystdout.getvalue()) - # iout.close() - sys.stdout = old_stdout - all_ilist += ilist - all_ilist.sort() - - allhops = hg.interval_list(reduce(lambda x, y: x + y, [irdh[1] for irdh in irdhops], [])) - allhops.sort() - allmerge = allhops.merge_clusters() - for am in allmerge: - nset = set() - for ami in am[1]: - nset.update(irdgroupdict[irddict[ami]]) - if irdgroupdict[irddict[ami]] in irdSets: - irdSets.remove(irdgroupdict[irddict[ami]]) - for ird in nset: - irdgroupdict[ird] = nset - irdSets.add(frozenset(nset)) - irdgroups = [] - for nset in irdSets: - ngroup = hg.interval_list([]) - for am in allmerge: - if irddict[am[1][0]] in nset: - ngroup.append(am[0]) - ngroup.sort() - irdgroups.append(ngroup) - - # TODO: Sort the irdgroups by minimum chrom and minimum coord here - irdgroups.sort() - # irdgroup_min_chrom_pos = [] - # for group in irdgroups: - # for x - -elif args.extendmode == "CLUSTERED" or args.extendmode == "VIRAL_CLUSTERED": - irdgroups = [rdList] -else: - irdgroups = [hg.interval_list([r]) for r in rdList] - - -logging.info("#TIME " + "%.3f\t" % (time() - TSTART) + "Interval sets for amplicons determined: ") -for il in enumerate(irdgroups): - logging.info( - "[amplicon" - + str(il[0] + 1) - + "]\t" - + ",".join([i.chrom + ":" + str(i.start) + "-" + str(i.end) for i in il[1]]) - ) - -summary_logger.info("#Amplicons = " + str(len(irdgroups))) -summary_logger.info("-----------------------------------------------------------------------------------------") - -if args.extendmode == "VIRAL": - amplicon_id = 0 -else: - amplicon_id = 1 - -for ig in irdgroups: - ilist = ig - ird = ig[0] - old_stdout = sys.stdout - sys.stdout = mystdout = StringIO() - adapter = PrefixAdapter(summary_logger, {"prefix": str(amplicon_id)}) - summaryFormatter = logging.Formatter("[amplicon" + str(amplicon_id) + "] %(message)s") - for handler in summary_logger.handlers: - handler.setFormatter(summaryFormatter) - summary_logger.info("AmpliconID = " + str(amplicon_id)) - summary_logger.info("#Intervals = " + str(len(ilist))) - ilist1 = hg.interval_list([a[0] for a in ilist.merge_clusters()]) - istr = ",".join([i.chrom + ":" + str(i.start) + "-" + str(i.end) for i in ilist1]) - summary_logger.info("Intervals = " + str(istr)) - oncolist = ",".join(set([a[1].info["Name"] for a in ilist1.intersection(hg.oncogene_list)])) + "," - summary_logger.info("OncogenesAmplified = " + str(oncolist)) - amplicon_name = outName + "_amplicon" + str(amplicon_id) - if args.runmode in ["FULL", "CYCLES", "BPGRAPH"]: - logging.info("#TIME " + "%.3f\t" % (time() - TSTART) + "Reconstructing amplicon" + str(amplicon_id)) - graph_handler = logging.FileHandler(amplicon_name + "_graph.txt", "w") - cycle_handler = logging.FileHandler(amplicon_name + "_cycles.txt", "w") - graph_logger.addHandler(graph_handler) - cycle_logger.addHandler(cycle_handler) - bamFileb2b.interval_filter_vertices(ilist, amplicon_name=amplicon_name, runmode=args.runmode) - graph_logger.removeHandler(graph_handler) - cycle_logger.removeHandler(cycle_handler) - if args.runmode in ["FULL", "SVVIEW"]: - logging.info("#TIME " + "%.3f\t" % (time() - TSTART) + "Plotting SV View for amplicon" + str(amplicon_id)) - bamFileb2b.plot_segmentation(ilist, amplicon_name, segments=segments, font=args.plotstyle) - summary_logger.info("-----------------------------------------------------------------------------------------") - iout = open(amplicon_name + "_logs.txt", "w") - 
iout.write(mystdout.getvalue()) - iout.close() - sys.stdout = old_stdout - amplicon_id += 1 - continue - - -if (args.extendmode in ["VIRAL", "VIRAL_CLUSTERED"]) and (args.runmode in ["FULL", "SVVIEW", "VIRALVIEW"]): - amplicon_id = 1 - for i in irdgroups[0]: - if i.intersects(rdList0[-1]) or len(hg.interval_list([i]).intersection(rdList)) == 0: - continue - logging.info("#TIME " + "%.3f\t" % (time() - TSTART) + "Plotting viral view for interval " + str(i)) - bamFileb2b.plot_segmentation( - hg.interval_list([i, rdList0[-1]]), - outName + "_amplicon" + str(amplicon_id), - scale_list=hg.interval_list([i]), - font="large", - ) - amplicon_id += 1 - - -logging.info("#TIME " + "%.3f\t" % (time() - TSTART) + "Total Runtime") diff --git a/bin/Coverage.py b/bin/Coverage.py old mode 100644 new mode 100755 diff --git a/bin/GroupedAnalysis.py b/bin/GroupedAnalysis.py deleted file mode 100755 index 3e1c1945..00000000 --- a/bin/GroupedAnalysis.py +++ /dev/null @@ -1,369 +0,0 @@ -#!/usr/bin/env python3 - -# Author: Jens Luebeck -# Contact: jluebeck [at] ucsd.edu -# License: BSD 2-Clause License -# Source: https://github.com/AmpliconSuite/AmpliconSuite-pipeline -# Commit: 0a8a2ff2324b15aab7cb88d310dcc458d06c0bedk - -import argparse -from datetime import datetime -import json -import os -import random -from subprocess import * -import sys -import time -import threading - -PAA_PATH = os.path.dirname(os.path.realpath(__file__)) + "/PrepareAA.py" - - -def generate_individual_seeds(cmd_dict, aa_py, odir, cnv_bed_dict): - individual_seed_dct = {} - print("Generating individual seeds") - for sname, argstring in cmd_dict.items(): - with open(sname + "_CNV_out.txt", "w") as outfile: - cmd = "{} {}{}".format(aa_py, PAA_PATH, argstring) - print(sname) - print(cmd + "\n") - call(cmd, stdout=outfile, stderr=outfile, shell=True) - - # if it was a seeds file, PAA won't modify, so move it into the right location - if sname in cnv_bed_dict and cnv_bed_dict[sname].endswith("AA_CNV_SEEDS.bed"): - cmd = "cp {} {}/".format(cnv_bed_dict[sname], odir) - call(cmd, shell=True) - - # store the name of the path of the seeds file - individual_seed_dct[sname] = "{}/{}_AA_CNV_SEEDS.bed".format(odir, sname) - - return individual_seed_dct - - -def group_seeds(individual_seed_dct, odir): - samplist = list(individual_seed_dct.keys()) - outname = odir + "_".join(samplist[:2]) - if len(samplist) > 2: - outname += "_etc_n" + str(len(samplist)) - - outname += "_merged_AA_CNV_SEEDS.bed" - - bedlist = " ".join(individual_seed_dct.values()) - print("Merging seeds") - cmd = "sort -k1,1 -k2,2n {} | bedtools merge -i - > {}".format(bedlist, outname) - print(cmd) - call(cmd, shell=True) - return outname - - -def launch_AA_AC(jobq, aa_py, PAA_PATH): - try: - sname, arg_string = jobq.pop() - - except IndexError: - return - - with open(sname + "_AA_AC_out.txt", "w") as outfile: - time.sleep(random.uniform(0, 0.75)) - cmd = "{} {}{}".format(aa_py, PAA_PATH, arg_string) - print("\nLaunching AA+AC job for " + sname + "\n" + cmd) - call(cmd, stdout=outfile, stderr=outfile, shell=True) - - -def create_AA_AC_cmds(tumor_lines, base_argstring, grouped_seeds): - cmd_dict = dict() - for tf in tumor_lines: - curr_argstring = "{} --run_AA --run_AC -s {} --bam {} --bed {}".format( - base_argstring, tf[0], tf[1], grouped_seeds - ) - - optionals = zip( - [ - "--sample_metadata", - ], - tf[4:], - ) - for k, v in optionals: - if v: - curr_argstring += " {} {}".format(k, v) - - cmd_dict[tf[0]] = curr_argstring - - return cmd_dict - - -# convert the parsed group input 
data to PrepareAA commands -def create_CNV_cmds(tumor_lines, normal_lines, base_argstring, cnvkit_dir): - if not normal_lines: - normalbam = None - - else: - normalbam = normal_lines[0] - if len(normal_lines) > 1: - print("More than one normal sample specified. Only the first will be used: " + normalbam[0]) - - cmd_dict = dict() - cnv_bed_dict = dict() - for tf in tumor_lines: - curr_argstring = "{} -s {} --bam {}".format(base_argstring, tf[0], tf[1]) - if normalbam: - curr_argstring += " --normal_bam {}".format(normalbam[1]) - - optionals = zip(["--cnv_bed", "--sample_metadata"], tf[3:]) - for k, v in optionals: - if v: - curr_argstring += " {} {}".format(k, v) - if k == "--cnv_bed": - cnv_bed_dict[tf[0]] = v - - if "--cnv_bed" not in curr_argstring and cnvkit_dir: - curr_argstring += " --cnvkit_dir " + cnvkit_dir - - # if QC is desired it will be done during stage 3 - if "--no_QC" not in curr_argstring: - curr_argstring += " --no_QC" - - cmd_dict[tf[0]] = curr_argstring - - return cmd_dict, cnv_bed_dict - - -def make_base_argstring(arg_dict, stop_at_seeds=False): - base_argstring = "" - for k, v in arg_dict.items(): - if v is True: - if k != "no_AA": - arg = " --" + k - base_argstring += arg - - elif v is not False and not k == "input" and not k == "cnvkit_dir": - arg = " --{} {}".format(k, str(v)) - base_argstring += arg - - return base_argstring - - -# read a file providing the group data -def read_group_data(input_file): - """ - group data is formatted as follows: - sample_name bam_file sample_type - where 'sample_type' is either 'tumor' or 'normal' - additional optional fields are as follows: - cnv_calls sample_metadata_json - """ - tumor_lines = [] - normal_lines = [] - with open(input_file) as infile: - for line in infile: - if line.startswith("#"): - continue - - fields = line.rstrip().rsplit() - if not fields: - continue - - for ind, v in enumerate(fields): - if v.upper() == "NA" or v.upper() == "NONE": - fields[ind] = None - - if fields[2].lower() == "tumor": - tumor_lines.append(fields) - - elif fields[2].lower() == "normal": - normal_lines.append(fields) - - else: - sys.stderr.write( - "Input formatting error! Column 3 must either be 'tumor' or 'normal'.\nSee README for " - "group input formatting instructions.\n\n" - ) - sys.exit(1) - - return tumor_lines, normal_lines - - -def get_argdict(args): - arg_dict = dict() - for arg in vars(args): - value = getattr(args, arg) - if value is not None and value != "": - arg_dict[arg] = value - - return arg_dict - - -# MAIN # -if __name__ == "__main__": - # Parses the command line arguments - parser = argparse.ArgumentParser( - description="A pipeline wrapper for AmpliconArchitect, invoking alignment CNV calling and CNV filtering prior. " - "Can launch AA, as well as downstream amplicon classification." - ) - parser.add_argument( - "-i", - "--input", - help="Input file providing the multi-sample information. See README for " - "information on how to format the input file.", - required=True, - ) - parser.add_argument( - "-o", "--output_directory", help="output directory names (will create if not already created)", required=True - ) - # parser.add_argument("-s", "--sample_name", help="sample name", required=True) - parser.add_argument( - "-t", - "--nthreads", - help="Number of threads to use in BWA, CNV calling and concurrent " "instances of PAA", - type=int, - required=True, - ) - parser.add_argument( - "--no_AA", help="Only produce the union of seeds for the group. 
Do not run AA/AC", action="store_true" - ) - # parser.add_argument("--run_AA", help="Run AA after all files prepared. Default off.", action='store_true') - # parser.add_argument("--run_AC", help="Run AmpliconClassifier after all files prepared. Default off.", - # action='store_true') - parser.add_argument( - "--ref", - help="Reference genome version.", - choices=["hg19", "GRCh37", "GRCh38", "hg38", "mm10", "GRCm38", "GRCh38_viral"], - ) - parser.add_argument("--cngain", type=float, help="CN gain threshold to consider for AA seeding", default=4.5) - parser.add_argument( - "--cnsize_min", type=int, help="CN interval size (in bp) to consider for AA seeding", default=50000 - ) - parser.add_argument("--downsample", type=float, help="AA downsample argument (see AA documentation)", default=10) - parser.add_argument( - "--use_old_samtools", - help="Indicate you are using an old build of samtools (prior to version " "1.0)", - action="store_true", - default=False, - ) - parser.add_argument( - "--rscript_path", - help="Specify custom path to Rscript, if needed when using CNVKit " "(which requires R version >3.4)", - ) - parser.add_argument("--python3_path", help="If needed, specify a custom path to python3.") - parser.add_argument( - "--aa_python_interpreter", - help="By default PrepareAA will use the system's default python path. If you would like to use " - "a different python version with AA, set this to either the path to the interpreter or " - "'python3' or 'python2'", - type=str, - default="python", - ) - # parser.add_argument("--freebayes_dir", - # help="Path to directory where freebayes executable exists (not the path to the executable " - # "itself). Only needed if using Canvas and freebayes is not installed on system path.") - # parser.add_argument("--vcf", help="VCF (in Canvas format, i.e., \"PASS\" in filter field, AD field as 4th entry of " - # "FORMAT field). When supplied with \"--sorted_bam\", pipeline will start from Canvas CNV stage." - # ) - parser.add_argument("--AA_src", help="Specify a custom $AA_SRC path. Overrides the bash variable") - parser.add_argument( - "--AA_runmode", - help="If --run_AA selected, set the --runmode argument to AA. Default mode is " "'FULL'", - choices=["FULL", "BPGRAPH", "CYCLES", "SVVIEW"], - default="FULL", - ) - parser.add_argument( - "--AA_extendmode", - help="If --run_AA selected, set the --extendmode argument to AA. Default " "mode is 'EXPLORE'", - choices=["EXPLORE", "CLUSTERED", "UNCLUSTERED", "VIRAL"], - default="EXPLORE", - ) - parser.add_argument( - "--AA_insert_sdevs", - help="Number of standard deviations around the insert size. May need to " - "increase for sequencing runs with high variance after insert size " - "selection step. (default 3.0)", - type=float, - default=3.0, - ) - # parser.add_argument("--normal_bam", help="Path to matched normal bam for CNVKit (optional)") - # parser.add_argument("--ploidy", type=float, help="Ploidy estimate for CNVKit (optional). This is not used outside " - # "of CNVKit.", default=None) - # parser.add_argument("--purity", type=float, help="Tumor purity estimate for CNVKit (optional). 
This is not used " - # "outside of CNVKit.", default=None) - parser.add_argument( - "--cnvkit_segmentation", - help="Segmentation method for CNVKit (if used), defaults to CNVKit " "default segmentation method (cbs).", - choices=["cbs", "haar", "hmm", "hmm-tumor", "hmm-germline", "none"], - default="cbs", - ) - parser.add_argument( - "--no_filter", help="Do not run amplified_intervals.py to identify amplified seeds", action="store_true" - ) - parser.add_argument("--no_QC", help="Skip QC on the BAM file.", action="store_true") - parser.add_argument("--skip_AA_on_normal_bam", help="Skip running AA on the normal bam", action="store_true") - # parser.add_argument("--sample_metadata", help="Path to a JSON of sample metadata to build on") - - # group = parser.add_mutually_exclusive_group(required=True) - # group.add_argument("--sorted_bam", "--bam", help="Coordinate sorted BAM file (aligned to an AA-supported " - # "reference.)") - # group.add_argument("--fastqs", help="Fastq files (r1.fq r2.fq)", nargs=2) - # group.add_argument("--completed_AA_runs", - # help="Path to a directory containing one or more completed AA runs which utilized the same reference genome.") - - # group2 = parser.add_mutually_exclusive_group() - # group2.add_argument("--cnv_bed", "--bed", - # help="BED file (or CNVKit .cns file) of CNV changes. Fields in the bed file should" - # " be: chr start end name cngain") - parser.add_argument( - "--cnvkit_dir", - help="Path to cnvkit.py. Assumes CNVKit is on the system path if not set. " "Not needed if --bed is given.", - ) - # group2.add_argument("--completed_run_metadata", - # help="Run metadata JSON to retroactively assign to collection of samples", default="") - # group2.add_argument("--align_only", help="Only perform the alignment stage (do not run CNV calling and seeding", - # action='store_true') - - args = parser.parse_args() - - if args.output_directory and not args.output_directory.endswith("/"): - args.output_directory += "/" - - if not args.aa_python_interpreter: - args.aa_python_interpreter = "python" - - arg_dict = get_argdict(args) - tumor_lines, normal_lines = read_group_data(args.input) - print("Found {} tumor samples and {} normals\n".format(str(len(tumor_lines)), str(len(normal_lines)))) - - # Stage 1: iterate over and launch each that needs CN calling. 
collect CN seeds files - base_argstring = make_base_argstring(arg_dict, stop_at_seeds=True) - print("Setting base argstring for Stage 1 as:") - print(base_argstring + "\n") - cmd_dict, cnv_bed_dict = create_CNV_cmds(tumor_lines, normal_lines, base_argstring, args.cnvkit_dir) - individual_seed_dct = generate_individual_seeds( - cmd_dict, args.aa_python_interpreter, args.output_directory, cnv_bed_dict - ) - - # Stage 2: merge seeds (bedtools - gotta sort and merge), and get new args - grouped_seeds = group_seeds(individual_seed_dct, args.output_directory) - - # Stage 3: launch each AA job in parallel - if not args.no_AA: - if args.skip_AA_on_normal_bam: - normal_lines = [] - - all_lines = normal_lines + tumor_lines - cmd_dict = create_AA_AC_cmds(all_lines, base_argstring, grouped_seeds) - threadL = [] - paa_threads = min(args.nthreads, len(all_lines)) - print("\nQueueing " + str(len(all_lines)) + " PAA jobs") - jobq = [] - for i in range(len(all_lines)): - sname = all_lines[i][0] - cmd_string = cmd_dict[sname] - jobq.append((sname, cmd_string)) - - for i in range(paa_threads): - threadL.append(threading.Thread(target=launch_AA_AC, args=(jobq, args.aa_python_interpreter, PAA_PATH))) - # threadL.append(workerThread(i, launch_AA_AC, cmd_string, args.aa_python_interpreter, PAA_PATH, sname)) - threadL[i].start() - - for t in threadL: - t.join() - - print("All jobs completed") diff --git a/bin/PrepareAA.py b/bin/PrepareAA.py deleted file mode 100755 index 28a946b0..00000000 --- a/bin/PrepareAA.py +++ /dev/null @@ -1,1088 +0,0 @@ -#!/usr/bin/env python - -# Author: Jens Luebeck -# Contact: jluebeck [at] ucsd.edu -# License: BSD 2-Clause License -# Source: https://github.com/AmpliconSuite/AmpliconSuite-pipeline -# Commit: 0a8a2ff2324b15aab7cb88d310dcc458d06c0bed - -import argparse -from datetime import datetime -import json -import logging -import os -import socket -from subprocess import * -import sys -import time - -import check_reference -import cnv_prefilter - -__version__ = "0.1537.2" - -PY3_PATH = "python3" # updated by command-line arg if specified -metadata_dict = {} # stores the run metadata (bioinformatic metadata) -sample_info_dict = {} # stores the sample metadata - - -def run_bwa(ref_fasta, fastqs, outdir, sname, nthreads, samtools, usingDeprecatedSamtools=False): - outname = outdir + sname - logging.info("Output prefix: " + outname) - logging.info("Checking for ref index") - exts = [".sa", ".amb", ".ann", ".pac", ".bwt"] - indexPresent = True - for i in exts: - if not os.path.exists(ref_fasta + i): - indexPresent = False - logging.info( - "Could not find " + ref_fasta + i + ", building BWA index from scratch. 
This could take > 60 minutes" - ) - break - - if not indexPresent: - cmd = "bwa index " + ref_fasta - call(cmd, shell=True) - - print("\nPerforming alignment and sorting") - if usingDeprecatedSamtools: - cmd = "{{ bwa mem -K 10000000 -t {} {} {} | {} view -Shu - | {} sort -m 4G -@4 - {}.cs; }} 2>{}_aln_stage.stderr".format( - nthreads, ref_fasta, fastqs, samtools, samtools, outname, outname - ) - else: - cmd = "{{ bwa mem -K 10000000 -t {} {} {} | {} view -Shu - | {} sort -m 4G -@4 -o {}.cs.bam -; }} 2>{}_aln_stage.stderr".format( - nthreads, ref_fasta, fastqs, samtools, samtools, outname, outname - ) - - logging.info(cmd) - call(cmd, shell=True) - metadata_dict["bwa_cmd"] = cmd - logging.info("\nPerforming duplicate removal & indexing") - cmd_list = [samtools, "rmdup", "-s", "{}.cs.bam".format(outname), "{}.cs.rmdup.bam".format(outname)] - # cmd_list = [samtools, "markdup", "-s", "-@ {}".format(nthreads), "{}.cs.bam".format(outname), {}.cs.rmdup.bam".format(outname)] - - logging.info(" ".join(cmd_list)) - call(cmd_list) - logging.info("\nRunning samtools index") - cmd_list = [samtools, "index", "{}.cs.rmdup.bam".format(outname)] - logging.info(" ".join(cmd_list)) - call(cmd_list) - logging.info("Removing temp BAM") - cmd = "rm {}.cs.bam".format(outname) - call(cmd, shell=True) - return outname + ".cs.rmdup.bam", outname + "_aln_stage.stderr" - - -def run_freebayes(ref, bam_file, outdir, sname, nthreads, regions, fb_path=None): - # Freebayes cmd-line args - # -f is fasta - # -r is region to call - logging.info("Running freebayes...") - fb_exec = "freebayes" - if fb_path: - fb_exec = fb_path + "/" + fb_exec - while True: - try: - curr_region_tup = regions.pop() - except IndexError: - break - - curr_region_string = curr_region_tup[0] + ":" + curr_region_tup[1] - logging.info(curr_region_string + ". " + str(len(regions)) + " items remaining.") - vcf_file = outdir + sname + "_" + curr_region_tup[0] + "_" + curr_region_tup[2] + ".vcf" - replace_filter_field_func = ( - 'awk \'{ if (substr($1,1,1) != "#" ) { $7 = ($7 == "." ? 
"PASS" : $7 ) }} 1 \' OFS="\\t"' - ) - cmd = "{} --genotype-qualities --standard-filters --use-best-n-alleles 5 --limit-coverage 25000 \ - --strict-vcf -f {} -r {} {} | {} > {}".format( - fb_exec, ref, curr_region_string, bam_file, replace_filter_field_func, vcf_file - ) - logging.info(cmd) - call(cmd, shell=True) - # gzip the new VCF - call("gzip -f " + vcf_file, shell=True) - - -def run_cnvkit(ckpy_path, nthreads, outdir, bamfile, seg_meth="cbs", normal=None, ref_fasta=None, vcf=None): - # CNVkit cmd-line args - # -m wgs: wgs data - # -y: assume chrY present - # -n: create flat reference (cnv baseline) - # -p: number of threads - # -f: reference genome fasta - bamBase = os.path.splitext(os.path.basename(bamfile))[0] - cnvkit_version = Popen([PY3_PATH, ckpy_path, "version"], stdout=PIPE, stderr=PIPE).communicate()[0].rstrip() - try: - cnvkit_version = cnvkit_version.decode("utf-8") - except UnicodeError: - pass - - metadata_dict["cnvkit_version"] = cnvkit_version - - ckRef = AA_REPO + args.ref + "/" + args.ref + "_cnvkit_filtered_ref.cnn" - logging.info("\nRunning CNVKit batch") - if normal: - # create a version of the stripped reference - scripts_dir = os.path.dirname(os.path.abspath(__file__)) + "/scripts/" - strip_cmd = "python {}reduce_fasta.py -r {} -c {} -o {}".format( - scripts_dir, ref_fasta, ref_genome_size_file, outdir - ) - call(strip_cmd, shell=True) - base = os.path.basename(ref_fasta) # args.ref is the name, ref is the fasta - stripRefG = outdir + os.path.splitext(base)[0] + "_reduced" + "".join(os.path.splitext(base)[1:]) - logging.debug("Stripped reference: " + stripRefG) - - cmd = "{} {} batch {} -m wgs --fasta {} -p {} -d {} --normal {}".format( - PY3_PATH, ckpy_path, bamfile, stripRefG, nthreads, outdir, normal - ) - else: - cmd = "{} {} batch -m wgs -r {} -p {} -d {} {}".format(PY3_PATH, ckpy_path, ckRef, nthreads, outdir, bamfile) - - logging.info(cmd) - call(cmd, shell=True) - metadata_dict["cnvkit_cmd"] = cmd + " ; " - rscript_str = "" - if args.rscript_path: - rscript_str = "--rscript-path " + args.rscript_path - logging.info("Set Rscript flag: " + rscript_str) - - cnrFile = outdir + bamBase + ".cnr" - cnsFile = outdir + bamBase + ".cns" - logging.info("\nRunning CNVKit segment") - # TODO: possibly include support for adding VCF calls. - cmd = "{} {} segment {} {} -p {} -m {} -o {}".format( - PY3_PATH, ckpy_path, cnrFile, rscript_str, nthreads, seg_meth, cnsFile - ) - logging.info(cmd) - exit_code = call(cmd, shell=True) - if exit_code != 0: - logging.error("CNVKit encountered a non-zero exit status. 
Exiting...\n") - sys.exit(1) - - metadata_dict["cnvkit_cmd"] = metadata_dict["cnvkit_cmd"] + cmd - logging.info("\nCleaning up temporary files") - cmd = "rm -f {}/*tmp.bed {}/*.cnn {}/*target.bed {}/*.bintest.cns".format(outdir, outdir, outdir, outdir) - logging.info(cmd) - call(cmd, shell=True) - cmd = "gzip -f " + cnrFile - logging.info(cmd) - call(cmd, shell=True) - if normal: - cmd = "rm " + stripRefG + " " + stripRefG + ".fai" - logging.info(cmd) - call(cmd, shell=True) - - -def merge_and_filter_vcfs(chr_names, vcf_list, outdir, sname): - logging.info("\nMerging VCFs and zipping") - # collect the vcf files to merge - merged_vcf_file = outdir + sname + "_merged.vcf" - relevant_vcfs = [x for x in vcf_list if any([i in x for i in chr_names])] - chrom_vcf_d = {} - for f in relevant_vcfs: - curr_chrom = f.rsplit(".vcf.gz")[0].rsplit("_")[-2:] - chrom_vcf_d[curr_chrom[0] + curr_chrom[1]] = f - - # chr_nums = [x.lstrip("chr") for x in chr_names] - pre_chr_str_names = [str(x) for x in range(1, 23)] + ["X", "Y"] - - # sort the elements - # include the header from the first one - if args.ref != "GRCh37" and args.ref != "GRCm38": - sorted_chr_names = ["chr" + str(x) for x in pre_chr_str_names] - cmd = "zcat " + chrom_vcf_d["chrM"] + """ | awk '$4 != "N"' > """ + merged_vcf_file - - else: - sorted_chr_names = [str(x) for x in pre_chr_str_names] - cmd = "zcat " + chrom_vcf_d["MT"] + """ | awk '$4 != "N"' > """ + merged_vcf_file - - logging.info(cmd) - call(cmd, shell=True) - - # zcat the rest, grepping out all header lines starting with "#" - logging.debug(sorted_chr_names) - for i in sorted_chr_names: - if i == "chrM" or i == "MT": - continue - - cmd_p = "zcat " + chrom_vcf_d[i + "p"] + """ | grep -v "^#" | awk '$4 != "N"' >> """ + merged_vcf_file - cmd_q = "zcat " + chrom_vcf_d[i + "q"] + """ | grep -v "^#" | awk '$4 != "N"' >> """ + merged_vcf_file - logging.info(cmd_p) - call(cmd_p, shell=True) - logging.info(cmd_q) - call(cmd_q, shell=True) - - cmd = "gzip -f " + merged_vcf_file - logging.info(cmd) - call(cmd, shell=True) - - return merged_vcf_file + ".gz" - - -# Read the CNVkit .cns files -def convert_cnvkit_cns_to_bed(cnvkit_output_directory, base, cnsfile=None, rescaled=False, nofilter=False): - if cnsfile is None: - if not rescaled: - cnsfile = cnvkit_output_directory + base + ".cns" - else: - cnsfile = cnvkit_output_directory + base + "_rescaled.cns" - - with open(cnsfile) as infile, open(cnvkit_output_directory + base + "_CNV_CALLS.bed", "w") as outfile: - head = next(infile).rstrip().rsplit("\t") - for line in infile: - fields = line.rstrip().rsplit("\t") - # s, e = int(fields[1]), int(fields[2]) - cn_r = float(fields[4]) - cn = 2 ** (cn_r + 1) - # do not filter on size since amplified_intervals.py will merge small ones. - outline = "\t".join(fields[0:3] + ["CNVkit", str(cn)]) + "\n" - outfile.write(outline) - - return cnvkit_output_directory + base + "_CNV_CALLS.bed" - - -def rescale_cnvkit_calls(ckpy_path, cnvkit_output_directory, base, cnsfile=None, ploidy=None, purity=None): - if purity is None and ploidy is None: - logging.warning("Warning: Rescaling called without --ploidy or --purity. Rescaling will have no effect.") - if cnsfile is None: - cnsfile = cnvkit_output_directory + base + ".cns" - - if purity < 0.4: - logging.warning("WARNING! 
Rescaling a low purity sample may cause many false-positive seed regions!") - - cmd = "{} {} call {} -m clonal".format(PY3_PATH, ckpy_path, cnsfile) - if purity: - cmd += " --purity " + str(purity) - if ploidy: - cmd += " --ploidy " + str(ploidy) - - cmd += " -o " + cnvkit_output_directory + base + "_rescaled.cns" - logging.info("Rescaling CNVKit calls\n" + cmd) - call(cmd, shell=True) - - -def run_amplified_intervals( - AA_interpreter, CNV_seeds_filename, sorted_bam, output_directory, sname, cngain, cnsize_min -): - logging.info("\nRunning amplified_intervals") - AA_seeds_filename = "{}_AA_CNV_SEEDS".format(output_directory + sname) - cmd = "{} {}/amplified_intervals.py --ref {} --bed {} --bam {} --gain {} --cnsize_min {} --out {}".format( - AA_interpreter, - AA_SRC, - args.ref, - CNV_seeds_filename, - sorted_bam, - str(cngain), - str(cnsize_min), - AA_seeds_filename, - ) - - logging.info(cmd) - exit_code = call(cmd, shell=True) - if exit_code != 0: - logging.error("amplified_intervals.py returned a non-zero exit code. Exiting...\n") - sys.exit(1) - - metadata_dict["amplified_intervals_cmd"] = cmd - return AA_seeds_filename + ".bed" - - -def run_AA( - AA_interpreter, - amplified_interval_bed, - sorted_bam, - AA_outdir, - sname, - downsample, - ref, - runmode, - extendmode, - insert_sdevs, -): - AA_version = ( - Popen([AA_interpreter, AA_SRC + "/AmpliconArchitect.py", "--version"], stdout=PIPE, stderr=PIPE) - .communicate()[1] - .rstrip() - ) - try: - AA_version = AA_version.decode("utf-8") - except UnicodeError: - pass - - metadata_dict["AA_version"] = AA_version - - cmd = "{} {}/AmpliconArchitect.py --ref {} --downsample {} --bed {} --bam {} --runmode {} --extendmode {} --out {}/{}".format( - AA_interpreter, - AA_SRC, - ref, - str(downsample), - amplified_interval_bed, - sorted_bam, - runmode, - extendmode, - AA_outdir, - sname, - ) - if insert_sdevs is not None: - cmd += " --insert_sdevs {}".format(str(insert_sdevs)) - - logging.info(cmd) - aa_exit_code = call(cmd, shell=True) - if aa_exit_code != 0: - logging.error("AmpliconArchitect returned a non-zero exit code. Exiting...\n") - sys.exit(1) - - metadata_dict["AA_cmd"] = cmd - - -def run_AC(AA_outdir, sname, ref, AC_outdir, AC_src): - logging.info("\nRunning AC") - # make input file - class_output = AC_outdir + sname - input_file = class_output + ".input" - bed_dir = class_output + "_classification_bed_files/" - if os.path.exists(bed_dir): - logging.warning( - "WARNING! AC files were not cleared prior to re-running. New classifications may become " - "mixed with previous classification files!" 
- ) - - cmd = "{}/make_input.sh {} {}".format(AC_src, AA_outdir, class_output) - logging.info(cmd) - call(cmd, shell=True) - - # run AC on input file - with open(input_file) as ifile: - sample_info_dict["number_of_AA_amplicons"] = len(ifile.readlines()) - - cmd = "{} {}/amplicon_classifier.py -i {} --ref {} -o {} --report_complexity".format( - PY3_PATH, AC_src, input_file, ref, class_output - ) - logging.info(cmd) - call(cmd, shell=True) - metadata_dict["AC_cmd"] = cmd - - # Get AC version - AC_version = ( - Popen([PY3_PATH, AC_src + "/amplicon_classifier.py", "--version"], stdout=PIPE, stderr=PIPE) - .communicate()[0] - .rstrip() - ) - try: - AC_version = AC_version.decode("utf-8") - except UnicodeError: - pass - - metadata_dict["AC_version"] = AC_version - - # iterate over the bed files and count anything that isn't "unknown" as a feature - feat_count = 0 - if os.path.exists(bed_dir): - for bf in os.listdir(bed_dir): - if not "unknown" in bf and bf.endswith(".bed"): - feat_count += 1 - - sample_info_dict["number_of_AA_features"] = feat_count - - -def make_AC_table(sname, AC_outdir, AC_src, run_metadata_file, sample_metadata_file, cnv_bed=None): - # make the AC output table - class_output = AC_outdir + sname - input_file = class_output + ".input" - summary_map_file = class_output + "_summary_map.txt" - classification_file = class_output + "_amplicon_classification_profiles.tsv" - cmd = "{} {}/make_results_table.py -i {} --classification_file {} --summary_map {}".format( - PY3_PATH, AC_src, input_file, classification_file, summary_map_file - ) - - if cnv_bed: - cmd += " --cnv_bed " + cnv_bed - - if run_metadata_file: - cmd += " --run_metadata_file " + run_metadata_file - - if sample_metadata_file: - cmd += " --sample_metadata_file " + sample_metadata_file - - logging.info(cmd) - call(cmd, shell=True) - - -def get_ref_sizes(ref_genome_size_file): - chr_sizes = {} - with open(ref_genome_size_file) as infile: - for line in infile: - fields = line.rstrip().rsplit() - if fields: - chr_sizes[fields[0]] = str(int(fields[1]) - 1) - - return chr_sizes - - -def get_ref_centromeres(ref_name): - centromere_dict = {} - fnameD = { - "GRCh38": "GRCh38_centromere.bed", - "GRCh37": "human_g1k_v37_centromere.bed", - "hg19": "hg19_centromere.bed", - "mm10": "mm10_centromere.bed", - "GRCm38": "GRCm38_centromere.bed", - "GRCh38_viral": "GRCh38_centromere.bed", - } - with open(AA_REPO + ref_name + "/" + fnameD[ref_name]) as infile: - for line in infile: - if not "centromere" in line and not "acen" in line: - continue - fields = line.rstrip().rsplit("\t") - if fields[0] not in centromere_dict: - centromere_dict[fields[0]] = (fields[1], fields[2]) - - else: - pmin = min(int(centromere_dict[fields[0]][0]), int(fields[1])) - pmax = max(int(centromere_dict[fields[0]][1]), int(fields[2])) - # pad with 20kb to avoid freebayes issues in calling near centromeres - centromere_dict[fields[0]] = (str(pmin - 20000), str(pmax + 20000)) - - return centromere_dict - - -def save_run_metadata(outdir, sname, args, launchtime, commandstring): - # make a dictionary that stores - # datetime - # hostname - # ref - # PAA command - # AA python interpreter version - # bwa cmd - # CN cmd - # AA cmd - # PAA version - # CNVKit version - # AA version - # AC version - metadata_dict["launch_datetime"] = launchtime - metadata_dict["hostname"] = socket.gethostname() - metadata_dict["ref_genome"] = args.ref - aapint = args.aa_python_interpreter if args.aa_python_interpreter else "python" - aa_python_v = Popen([aapint, "--version"], stdout=PIPE, 
stderr=PIPE).communicate()[1].rstrip() - try: - aa_python_v = aa_python_v.decode("utf-8") - except UnicodeError: - pass - - metadata_dict["AA_python_version"] = aa_python_v - - metadata_dict["PAA_command"] = commandstring - metadata_dict["PAA_version"] = __version__ - - for x in [ - "bwa_cmd", - "cnvkit_cmd", - "amplified_intervals_cmd", - "AA_cmd", - "AC_cmd", - "cnvkit_version", - "AA_version", - "AC_version", - ]: - if x not in metadata_dict: - metadata_dict[x] = "NA" - - # save the json dict - run_metadata_filename = outdir + sname + "_run_metadata.json" - with open(run_metadata_filename, "w") as fp: - json.dump(metadata_dict, fp, indent=2) - - # sample_info_dict["run_metadata_file"] = run_metadata_filename - return run_metadata_filename - - -def detect_run_failure(align_stderr_file, AA_outdir, sname, AC_outdir): - if align_stderr_file: - cmd = "grep -i error " + align_stderr_file - try: - aln_errs = check_output(cmd, shell=True).decode("utf-8") - - except CalledProcessError: - aln_errs = "" - - if aln_errs: - logging.error("Detected error during bwa mem alignment stage\n") - return True - - if AA_outdir: - sumfile = AA_outdir + sname + "_summary.txt" - if os.path.isfile(sumfile): - namps = -1 - with open(sumfile) as infile: - for line in infile: - if line.startswith("#Amplicons = "): - namps = int(line.rstrip().rsplit(" = ")[-1]) - break - - if namps < 0: - logging.error("Detected truncated or missing AA outputs") - return True - - for x in range(1, namps + 1): - try: - fsize = os.stat(AA_outdir + sname + "_amplicon" + str(x) + "_cycles.txt").st_size - - except OSError: - fsize = 0 - - if fsize == 0: - logging.error("Detected truncated or missing AA outputs") - return True - - else: - logging.error("Detected error during AA stage") - return True - - if AC_outdir: - try: - fsize1 = os.stat(AC_outdir + sname + "_amplicon_classification_profiles.tsv").st_size - fsize2 = os.stat(AC_outdir + sname + "_result_table.tsv").st_size - - except OSError: - fsize1 = 0 - fsize2 = 0 - - if fsize1 == 0 or fsize2 == 0: - logging.error("Detected error during AC stage\n") - return True - - return False - - -# MAIN # -if __name__ == "__main__": - # Parses the command line arguments - parser = argparse.ArgumentParser( - description="A pipeline wrapper for AmpliconArchitect, invoking alignment CNV calling and CNV filtering prior. " - "Can launch AA, as well as downstream amplicon classification." - ) - parser.add_argument("-o", "--output_directory", help="output directory names (will create if not already created)") - parser.add_argument("-s", "--sample_name", help="sample name", required=True) - parser.add_argument("-t", "--nthreads", help="Number of threads to use in BWA and CNV calling", required=True) - parser.add_argument("--run_AA", help="Run AA after all files prepared. Default off.", action="store_true") - parser.add_argument( - "--run_AC", help="Run AmpliconClassifier after all files prepared. 
Default off.", action="store_true" - ) - parser.add_argument( - "--ref", - help="Reference genome version.", - choices=["hg19", "GRCh37", "GRCh38", "hg38", "mm10", "GRCm38", "GRCh38_viral"], - ) - parser.add_argument("--cngain", type=float, help="CN gain threshold to consider for AA seeding", default=4.5) - parser.add_argument( - "--cnsize_min", type=int, help="CN interval size (in bp) to consider for AA seeding", default=50000 - ) - parser.add_argument("--downsample", type=float, help="AA downsample argument (see AA documentation)", default=10) - parser.add_argument( - "--use_old_samtools", - help="Indicate you are using an old build of samtools (prior to version " "1.0)", - action="store_true", - default=False, - ) - parser.add_argument( - "--rscript_path", - help="Specify custom path to Rscript, if needed when using CNVKit " "(which requires R version >3.4)", - ) - parser.add_argument("--python3_path", help="If needed, specify a custom path to python3.") - parser.add_argument( - "--aa_python_interpreter", - help="By default PrepareAA will use the system's default python path. If you would like to use " - "a different python version with AA, set this to either the path to the interpreter or " - "'python3' or 'python2'", - type=str, - default="python", - ) - # parser.add_argument("--freebayes_dir", - # help="Path to directory where freebayes executable exists (not the path to the executable " - # "itself). Only needed if using Canvas and freebayes is not installed on system path.") - # parser.add_argument("--vcf", help="VCF (in Canvas format, i.e., \"PASS\" in filter field, AD field as 4th entry of " - # "FORMAT field). When supplied with \"--sorted_bam\", pipeline will start from Canvas CNV stage." - # ) - parser.add_argument("--AA_src", help="Specify a custom $AA_SRC path. Overrides the bash variable") - parser.add_argument( - "--AA_runmode", - help="If --run_AA selected, set the --runmode argument to AA. Default mode is " "'FULL'", - choices=["FULL", "BPGRAPH", "CYCLES", "SVVIEW"], - default="FULL", - ) - parser.add_argument( - "--AA_extendmode", - help="If --run_AA selected, set the --extendmode argument to AA. Default " "mode is 'EXPLORE'", - choices=["EXPLORE", "CLUSTERED", "UNCLUSTERED", "VIRAL"], - default="EXPLORE", - ) - parser.add_argument( - "--AA_insert_sdevs", - help="Number of standard deviations around the insert size. May need to " - "increase for sequencing runs with high variance after insert size selection step. (default " - "3.0)", - type=float, - default=None, - ) - parser.add_argument("--normal_bam", help="Path to matched normal bam for CNVKit (optional)") - parser.add_argument( - "--ploidy", - type=float, - help="Ploidy estimate for CNVKit (optional). This is not used outside of CNVKit.", - default=None, - ) - parser.add_argument( - "--purity", - type=float, - help="Tumor purity estimate for CNVKit (optional). This is not used outside of CNVKit.", - default=None, - ) - parser.add_argument( - "--cnvkit_segmentation", - help="Segmentation method for CNVKit (if used), defaults to CNVKit " "default segmentation method (cbs).", - choices=["cbs", "haar", "hmm", "hmm-tumor", "hmm-germline", "none"], - default="cbs", - ) - parser.add_argument( - "--no_filter", help="Do not run amplified_intervals.py to identify amplified seeds", action="store_true" - ) - parser.add_argument( - "--no_QC", - help="Skip QC on the BAM file. 
Do not adjust AA insert_sdevs for " "poor-quality insert size distribution", - action="store_true", - ) - parser.add_argument("--sample_metadata", help="Path to a JSON of sample metadata to build on") - parser.add_argument( - "-v", "--version", action="version", version="PrepareAA version {version} \n".format(version=__version__) - ) - parser.add_argument( - "--samtools_path", - help="Path to samtools binary (e.g., /path/to/my/samtools). If unset, will use samtools on system path.", - default="", - ) - group = parser.add_mutually_exclusive_group(required=True) - group.add_argument( - "--sorted_bam", "--bam", help="Coordinate sorted BAM file (aligned to an AA-supported " "reference.)" - ) - group.add_argument("--fastqs", help="Fastq files (r1.fq r2.fq)", nargs=2) - group.add_argument( - "--completed_AA_runs", - help="Path to a directory containing one or more completed AA runs which utilized the same reference genome.", - ) - group2 = parser.add_mutually_exclusive_group() - group2.add_argument( - "--cnv_bed", - "--bed", - help="BED file (or CNVKit .cns file) of CNV changes. Fields in the bed file should" - " be: chr start end name cngain", - ) - group2.add_argument( - "--cnvkit_dir", help="Path to cnvkit.py. Assumes CNVKit is on the system path if not set", default="" - ) - group2.add_argument( - "--completed_run_metadata", - help="Run metadata JSON to retroactively assign to collection of samples", - default="", - ) - group2.add_argument( - "--align_only", help="Only perform the alignment stage (do not run CNV calling and seeding", action="store_true" - ) - - # start timing - ta = time.time() - ti = ta - launchtime = str(datetime.now()) - args = parser.parse_args() - - # set an output directory if user did not specify - if not args.output_directory: - args.output_directory = os.getcwd() - - if not args.output_directory.endswith("/"): - args.output_directory += "/" - - sname = args.sample_name - outdir = args.output_directory - sample_metadata_filename = args.output_directory + sname + "_sample_metadata.json" - - # set samtools for use, 20230428 - if not args.samtools_path.endswith("/samtools"): - if args.samtools_path and not args.samtools_path.endswith("/"): - args.samtools_path += "/" - args.samtools_path += "samtools" - - # Make and clear necessary directories. - # make the output directory location if it does not exist - if not os.path.exists(args.output_directory): - os.mkdir(args.output_directory) - - # initiate logging - paa_logfile = args.output_directory + sname + ".log" - logging.basicConfig(filename=paa_logfile, format="[%(name)s:%(levelname)s]\t%(message)s", level=logging.INFO) - logging.getLogger().addHandler(logging.StreamHandler()) - logging.info("Launched on " + launchtime) - logging.info("AmpiconSuite-pipeline version " + __version__ + "\n") - - commandstring = "" - for arg in sys.argv: - if " " in arg: - commandstring += '"{}" '.format(arg) - else: - commandstring += "{} ".format(arg) - - logging.info(commandstring + "\n") - - if "/" in args.sample_name: - logging.error("Sample name -s cannot be a path. Specify output directory with -o.\n") - sys.exit(1) - - finish_flag_filename = args.output_directory + args.sample_name + "_finish_flag.txt" - if os.path.exists(finish_flag_filename): - logging.warning( - "WARNING: Running PrepareAA.py with outputs directed into the same exact output prefix may " - "cause crashes or other unexpected behavior. 
To avoid errors, clear previous files before " - "re-running.\n" - ) - - with open(finish_flag_filename, "w") as ffof: - ffof.write("UNSUCCESSFUL\n") - - timing_logfile = open(args.output_directory + args.sample_name + "_timing_log.txt", "w") - timing_logfile.write("#stage:\twalltime(seconds)\n") - - # Check if expected system paths and files are present. Check if provided argument combinations are valid. - if args.AA_src: - os.environ["AA_SRC"] = args.AA_src - - # Check if AA_REPO set, print error and quit if not - try: - AA_REPO = os.environ["AA_DATA_REPO"] + "/" - - except KeyError: - logging.error("AA_DATA_REPO bash variable not found. AmpliconArchitect may not be properly installed.\n") - sys.exit(1) - - if not os.path.exists(os.path.join(AA_REPO, "coverage.stats")): - logging.info("coverage.stats file not found in " + AA_REPO + "\nCreating a new coverage.stats file.") - cmd = "touch {}coverage.stats && chmod a+rw {}coverage.stats".format(AA_REPO, AA_REPO) - logging.info(cmd) - call(cmd, shell=True) - - try: - AA_SRC = os.environ["AA_SRC"] - - except KeyError: - logging.error("AA_SRC bash variable not found. AmpliconArchitect may not be properly installed.\n") - sys.exit(1) - - if (args.fastqs or args.completed_AA_runs) and not args.ref: - logging.error("Must specify --ref when providing unaligned fastq files.\n") - sys.exit(1) - - if args.completed_run_metadata.lower() == "none": - args.completed_run_metadata = None - - # if not these args are set, assume cnvkit.py is on the path. - if not (args.cnv_bed or args.cnvkit_dir or args.completed_run_metadata or args.align_only) and ( - args.fastqs or args.sorted_bam - ): - try: - args.cnvkit_dir = str(check_output(["which cnvkit.py"], shell=True).decode("utf-8").rstrip()) - - except CalledProcessError: - logging.error("cnvkit.py not found on system path. Must specify --cnvkit_dir") - sys.exit(1) - - elif args.cnvkit_dir and not args.cnvkit_dir.endswith("/") and not args.cnvkit_dir.endswith("cnvkit.py"): - args.cnvkit_dir += "/" - - else: - args.completed_run_metadata = None - - if not args.cnvkit_dir.endswith("cnvkit.py"): - args.cnvkit_dir += "cnvkit.py" - - if args.run_AA: - if not os.path.exists(os.environ["HOME"] + "/mosek/mosek.lic") and not "MOSEKLM_LICENSE_FILE" in os.environ: - logging.error("--run_AA set, but MOSEK license not found!") - sys.exit(1) - - elif "MOSEKLM_LICENSE_FILE" in os.environ and not os.path.exists( - os.environ["MOSEKLM_LICENSE_FILE"] + "/mosek.lic" - ): - logging.error("--run_AA set, but MOSEK license not found!") - sys.exit(1) - - runCNV = None - if args.cnvkit_dir and not args.cnv_bed: - runCNV = "CNVkit" - # check Rscript version - test_rscript = "Rscript" - if args.rscript_path: - if not args.rscript_path.endswith("/Rscript"): - args.rscript_path += "/Rscript" - - test_rscript = args.rscript_path - - try: - rscript_version_out = str(check_output([test_rscript, "--version"], stderr=STDOUT).decode("utf-8").rstrip()) - - except CalledProcessError: - logging.error(test_rscript + " not found. 
Must specify --rscript_path") - sys.exit(1) - - if args.python3_path: - if not args.python3_path.endswith("/python") and not args.python3_path.endswith("/python3"): - args.python3_path += "/python3" - - PY3_PATH = args.python3_path - - refFnames = {x: None for x in ["hg19", "GRCh37", "GRCh38", "GRCh38_viral", "mm10"]} - # Paths of all the repo files needed - if args.ref == "hg38": - args.ref = "GRCh38" - if args.ref == "GRCm38": - args.ref = "mm10" - - for rname in refFnames.keys(): - if os.path.exists(AA_REPO + "/" + rname): - refFnames[rname] = check_reference.get_ref_fname(AA_REPO, rname) - - faidict = {} - if args.sorted_bam: - if args.ref and refFnames[args.ref]: - faidict[args.ref] = AA_REPO + args.ref + "/" + refFnames[args.ref] + ".fai" - - elif args.ref and refFnames[args.ref] is None: - em = ( - "Data repo files for ref " + args.ref + " not found. Please download from " - "https://datasets.genepattern.org/?prefix=data/module_support_files/AmpliconArchitect/\n" - ) - logging.error(em) - sys.stderr.write(em) - sys.exit(1) - - else: - for k, v in refFnames.items(): - if v: - faidict[k] = AA_REPO + k + "/" + v + ".fai" - - determined_ref = check_reference.check_ref(args.sorted_bam, faidict, args.samtools_path) - if not determined_ref and not args.ref: - logging.error("Please make sure AA data repo is populated.") - sys.exit(1) - - elif not args.ref: - args.ref = determined_ref - - elif args.ref and not determined_ref: - logging.warning("WARNING! The BAM file did not match " + args.ref) - - gdir = AA_REPO + args.ref + "/" - ref_fasta = gdir + refFnames[args.ref] - ref_genome_size_file = gdir + args.ref + "_noAlt.fa.fai" - removed_regions_bed = gdir + args.ref + "_merged_centromeres_conserved_sorted.bed" - # ploidy_vcf = gdir + "dummy_ploidy.vcf" - if not os.path.isfile(removed_regions_bed): - logging.debug(str(os.listdir(gdir)) + "\n") - logging.error("PrepareAA data repo files not found in AA data repo. 
Please update your data repo.\n") - sys.exit(1) - - elif args.cnv_bed and not os.path.isfile(args.cnv_bed): - logging.error("Specified CNV bed file does not exist: " + args.cnv_bed + "\n") - sys.exit(1) - - if not args.sample_metadata: - args.sample_metadata = os.path.dirname(os.path.realpath(__file__)) + "/sample_metadata_skeleton.json" - - with open(args.sample_metadata) as input_json: - sample_info_dict = json.load(input_json) - - sample_info_dict["reference_genome"] = args.ref - sample_info_dict["sample_name"] = sname - - tb = time.time() - timing_logfile.write("Initialization:\t" + "{:.2f}".format(tb - ta) + "\n") - ta = tb - logging.info("Running PrepareAA on sample: " + sname) - # Begin PrepareAA pipeline - aln_stage_stderr = None - if args.fastqs: - # Run BWA - fastqs = " ".join(args.fastqs) - logging.info("Will perform alignment on " + fastqs) - args.sorted_bam, aln_stage_stderr = run_bwa( - ref_fasta, fastqs, outdir, sname, args.nthreads, args.samtools_path, args.use_old_samtools - ) - - if not args.completed_AA_runs: - bamBaiNoExt = args.sorted_bam[:-3] + "bai" - cramCraiNoExt = args.sorted_bam[:-4] + "crai" - baiExists = os.path.isfile(args.sorted_bam + ".bai") or os.path.isfile(bamBaiNoExt) - craiExists = os.path.isfile(args.sorted_bam + ".crai") or os.path.isfile(cramCraiNoExt) - if not baiExists and not craiExists: - logging.info(args.sorted_bam + " index not found, calling samtools index") - call([args.samtools_path, "index", args.sorted_bam]) - logging.info("Finished indexing") - - bambase = os.path.splitext(os.path.basename(args.sorted_bam))[0] - prop_paired_proportion = None - if not args.no_QC: - logging.debug("samtools path is set to: " + args.samtools_path) - prop_paired_proportion = check_reference.check_properly_paired(args.sorted_bam, args.samtools_path) - - tb = time.time() - timing_logfile.write("Alignment, indexing and QC:\t" + "{:.2f}".format(tb - ta) + "\n") - - if args.align_only: - logging.info("Completed\n") - tf = time.time() - timing_logfile.write("Total_elapsed_walltime\t" + "{:.2f}".format(tf - ti) + "\n") - timing_logfile.close() - sys.exit() - - ta = tb - centromere_dict = get_ref_centromeres(args.ref) - chr_sizes = get_ref_sizes(ref_genome_size_file) - # coordinate CNV calling - if runCNV == "CNVkit": - cnvkit_output_directory = args.output_directory + sname + "_cnvkit_output/" - if not os.path.exists(cnvkit_output_directory): - os.mkdir(cnvkit_output_directory) - - run_cnvkit( - args.cnvkit_dir, - args.nthreads, - cnvkit_output_directory, - args.sorted_bam, - seg_meth=args.cnvkit_segmentation, - normal=args.normal_bam, - ref_fasta=ref_fasta, - ) - if args.ploidy or args.purity: - rescale_cnvkit_calls( - args.cnvkit_dir, cnvkit_output_directory, bambase, ploidy=args.ploidy, purity=args.purity - ) - rescaling = True - else: - rescaling = False - - args.cnv_bed = convert_cnvkit_cns_to_bed(cnvkit_output_directory, bambase, rescaled=rescaling) - - if args.cnv_bed.endswith(".cns"): - args.cnv_bed = convert_cnvkit_cns_to_bed(outdir, bambase, cnsfile=args.cnv_bed, nofilter=True) - - tb = time.time() - timing_logfile.write("CNV calling:\t" + "{:.2f}".format(tb - ta) + "\n") - ta = tb - - sample_info_dict["sample_cnv_bed"] = args.cnv_bed - - if not args.no_filter and not args.cnv_bed.endswith("_AA_CNV_SEEDS.bed"): - if not args.cnv_bed.endswith("_CNV_CALLS_pre_filtered.bed"): - args.cnv_bed = cnv_prefilter.prefilter_bed( - args.cnv_bed, args.ref, centromere_dict, chr_sizes, args.cngain, args.output_directory - ) - - amplified_interval_bed = 
run_amplified_intervals( - args.aa_python_interpreter, args.cnv_bed, args.sorted_bam, outdir, sname, args.cngain, args.cnsize_min - ) - - else: - logging.info("Skipping filtering of bed file.") - amplified_interval_bed = args.cnv_bed - - tb = time.time() - timing_logfile.write("Seed filtering (amplified_intervals.py):\t" + "{:.2f}".format(tb - ta) + "\n") - ta = tb - - # Run AA - if args.run_AA: - AA_outdir = outdir + sname + "_AA_results/" - if not os.path.exists(AA_outdir): - os.mkdir(AA_outdir) - - # set the insert sdevs if not given by user. - if ( - not args.no_QC - and not args.AA_insert_sdevs - and prop_paired_proportion is not None - and prop_paired_proportion < 90 - ): - logging.info("Properly paired rate less than 90%, setting --insert_sdevs 9.0 for AA") - args.AA_insert_sdevs = 9.0 - - run_AA( - args.aa_python_interpreter, - amplified_interval_bed, - args.sorted_bam, - AA_outdir, - sname, - args.downsample, - args.ref, - args.AA_runmode, - args.AA_extendmode, - args.AA_insert_sdevs, - ) - tb = time.time() - timing_logfile.write("AmpliconArchitect:\t" + "{:.2f}".format(tb - ta) + "\n") - ta = tb - # Run AC - if args.run_AC: - AC_SRC = os.environ["AC_SRC"] - AC_outdir = outdir + sname + "_classification/" - if not os.path.exists(AC_outdir): - os.mkdir(AC_outdir) - - run_AC(AA_outdir, sname, args.ref, AC_outdir, AC_SRC) - - tb = time.time() - timing_logfile.write("AmpliconClassifier:\t" + "{:.2f}".format(tb - ta) + "\n") - - run_metadata_filename = save_run_metadata(outdir, sname, args, launchtime, commandstring) - - with open(sample_metadata_filename, "w") as fp: - json.dump(sample_info_dict, fp, indent=2) - - if args.run_AA and args.run_AC: - make_AC_table( - sname, - AC_outdir, - AC_SRC, - run_metadata_filename, - sample_metadata_filename, - sample_info_dict["sample_cnv_bed"], - ) - - else: - ta = time.time() - AC_SRC = os.environ["AC_SRC"] - AC_outdir = outdir + sname + "_classification/" - if not os.path.exists(AC_outdir): - os.mkdir(AC_outdir) - - run_AC(args.completed_AA_runs, sname, args.ref, AC_outdir, AC_SRC) - - tb = time.time() - timing_logfile.write("AmpliconClassifier:\t" + "{:.2f}".format(tb - ta) + "\n") - - with open(sample_metadata_filename, "w") as fp: - json.dump(sample_info_dict, fp, indent=2) - - make_AC_table(sname, AC_outdir, AC_SRC, args.completed_run_metadata, sample_metadata_filename) - - if not args.run_AA: - AA_outdir = None - - if not args.run_AC: - AC_outdir = None - - if not detect_run_failure(aln_stage_stderr, AA_outdir, sname, AC_outdir): - logging.info("\nAll stages appear to have completed successfully.") - with open(args.output_directory + args.sample_name + "_finish_flag.txt", "w") as ffof: - ffof.write("All stages completed\n") - - tf = time.time() - timing_logfile.write("Total_elapsed_walltime\t" + "{:.2f}".format(tf - ti) + "\n") - timing_logfile.close() diff --git a/bin/abstract_graph.py b/bin/abstract_graph.py deleted file mode 100755 index 916f82ef..00000000 --- a/bin/abstract_graph.py +++ /dev/null @@ -1,160 +0,0 @@ -#!/usr/bin/env python - -# This software is Copyright 2017 The Regents of the University of California. All Rights Reserved. Permission to copy, modify, and distribute this software and its documentation for educational, research and non-profit purposes, without fee, and without a written agreement is hereby granted, provided that the above copyright notice, this paragraph and the following three paragraphs appear in all copies. 
Permission to make commercial use of this software may be obtained by contacting: -# -# Office of Innovation and Commercialization -# -# University of California -# -# La Jolla, CA 92093-0910 -# -# (858) 534-5815 -# -# invent@ucsd.edu -# -# This software program and documentation are copyrighted by The Regents of the University of California. The software program and documentation are supplied "as is", without any accompanying services from The Regents. The Regents does not warrant that the operation of the program will be uninterrupted or error-free. The end-user understands that the program was developed for research purposes and is advised not to rely exclusively on the program for any reason. -# -# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. - - -# Author: Viraj Deshpande -# Contact: virajbdeshpande@gmail.com - - -# This file defines classes and methods for an abstract undirected graph, vertex and edge. - - -import logging - - -class abstract_vertex(object): - """Class describing a graph vertex. - Attributes: - elist: List of abstract_edges - vid: (optional) ID for the abstract_vertex - graph: (optional) abstract_graph to which the vertex belongs""" - - def __init__(self, vid=0, graph=None): - """Initiate vertex with optional vid and graph""" - self.elist = [] - self.vid = vid # vertexid - self.graph = graph - if self.vid == 0 and self.graph is not None: - self.vid = self.graph.next_vid() - if self.graph is not None: - if vid in graph.vs: - raise Exception("Adding with duplicate vid") - self.graph.include_vertex(self) - - def neighbors(self): - """Return list of vertices connected to abstract_vertex by a direct edge""" - return [e.v2 for e in self.elist] - - def __hash__(self): - """Return hash based on vid to allow to efficiently check for presence of vid in graph, etc""" - return self.vid - - def __repr__(self): - """Vertex is represented by vid""" - return str(self.vid) - - -class abstract_edge(object): - """Class describing a graph edge. - Attributes: - v1, v2: Ordered pair of vertices connected by the edge - eid: (optional) ID for the abstract_edge - graph: (optional) abstract_graph to which the vertex belongs.""" - - def __init__(self, v1, v2, eid=0, graph=None, update_vertices=True): - """Initiate edge - Arguments: v1, v2, (optional)eid, (optional) graph. - update_vertices: (optional True/False) to update vertices to include edge in v1.elist, v2.elist. 
(default=True) - """ - self.v1, self.v2 = v1, v2 - self.eid = eid - self.graph = graph - if self.eid == 0 and self.graph is not None: - self.eid = self.graph.next_eid() - if self.graph is not None: - if eid in self.graph.es: - raise Exception("Adding edge with duplicate eid") - self.graph.include_edge(self) - if update_vertices: - if v1.graph is not v2.graph: - raise Exception("Adding edge between vertices of different graphs.") - if graph is not None and v1.graph is not graph: - raise Exception("Edge in different graph than vertex.") - if graph is None and v1.graph is not None: - graph = v1.graph - v1.elist.append(self) - v2.elist.append(self) - - def neighbor(self, v): - """Given a vertex, return its neighbor along the edge""" - if v == self.v1: - return self.v2 - if v == self.v2: - return self.v1 - raise Exception("Edge not connected to vertex") - - def __hash__(self): - """Return hash based on eid to allow to efficiently check for presence of eid in graph, etc""" - return self.eid - - def length(self): - """Not implemented""" - pass - - def __repr__(self): - """String representation of the form v1<->v2.""" - return str(self.v1) + "<->" + str(self.v2) - - -class abstract_graph(object): - """Class describing a graph. - Attributes: - vs: Dictionary from vid/key to vertex - es: Dictionary from eid/key to edge - max_vid: (internal) max_vid, used to assign vid for new vertex. Suggested to use function next_vid. - max_eid: (internal) max_eid, used to assign eid for new edge. Suggested to use function next_eid.""" - - def __init__(self): - """Initiate empty graph""" - self.es = {} # key -->edges - self.vs = {} # key -->vertices - # self.logger = logging.getLogger('Algae') - self.max_eid = 1 - self.max_vid = 1 - - def include_vertex(self, v): - """Include orphan abstract_vertex in graph and update vertex.graph to point to self""" - if v.vid in self.vs and self.vs[v.vid] is not v: - raise Exception("Adding vertex with duplicate vid") - if v.graph is not None and v.graph is not self: - raise Exception("Adding vertex from another graph") - if v.graph is None: - v.graph = self - self.vs[v.vid] = v - - def include_edge(self, e): - """Include orphan abstract_edge in graph and update edge.graph to point to self. Vertices should be updated separately""" - if e.eid in self.es and self.es[e.eid] is not e: - raise Exception("Adding edge with duplicate eid") - if e.graph is not None and e.graph is not self: - raise Exception("Adding edge from another graph") - if e.graph is None: - e.graph = self - self.es[e.eid] = e - - def next_eid(self): - """Find the next eid available for assignment to new edge""" - while self.max_eid in self.es or -1 * self.max_eid in self.es: - self.max_eid += 1 - return self.max_eid - - def next_vid(self): - """Find the next vid available for assignment to new vertex""" - while self.max_vid in self.vs or -1 * self.max_vid in self.vs: - self.max_vid += 1 - return self.max_vid diff --git a/bin/amplified_intervals.py b/bin/amplified_intervals.py deleted file mode 100755 index 3f9da206..00000000 --- a/bin/amplified_intervals.py +++ /dev/null @@ -1,248 +0,0 @@ -#!/usr/bin/env python - -# This software is Copyright 2017 The Regents of the University of California. All Rights Reserved. Permission to copy, modify, and distribute this software and its documentation for educational, research and non-profit purposes, without fee, and without a written agreement is hereby granted, provided that the above copyright notice, this paragraph and the following three paragraphs appear in all copies.
Permission to make commercial use of this software may be obtained by contacting: -# -# Office of Innovation and Commercialization -# -# University of California -# -# La Jolla, CA 92093-0910 -# -# (858) 534-5815 -# -# invent@ucsd.edu -# -# This software program and documentation are copyrighted by The Regents of the University of California. The software program and documentation are supplied "as is", without any accompanying services from The Regents. The Regents does not warrant that the operation of the program will be uninterrupted or error-free. The end-user understands that the program was developed for research purposes and is advised not to rely exclusively on the program for any reason. -# -# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. - -# Author: Viraj Deshpande -# Contact: virajbdeshpande@gmail.com -# Maintained by Jens Luebeck jluebeck@ucsd.edu -# Source: https://github.com/jluebeck/AmpliconArchitect -# Commit: 2172cdfd5b2834f98f60a5ee77f282249e16f527 - -import argparse -import logging -import os -import sys - -import numpy as np -import pysam - -import global_names - -sys.setrecursionlimit(10000) - -GAIN = 4.5 -CNSIZE_MIN = 50000 - - -parser = argparse.ArgumentParser(description="Filter and merge amplified intervals") -parser.add_argument( - "--bed", - dest="bed", - help="Input bed file with list of amplified intervals", - metavar="FILE", - action="store", - type=str, - required=True, -) -parser.add_argument( - "--out", - dest="out", - help="OPTIONAL: Prefix filename for output bed file. Default: _amplified.bed", - metavar="FILE", - action="store", - type=str, - default="", -) -parser.add_argument( - "--bam", - dest="bam", - help="OPTIONAL: Bamfile, used to avoid large aneuploidies", - metavar="FILE", - action="store", - type=str, - default="", -) -parser.add_argument( - "--gain", - dest="gain", - help="OPTIONAL: CN gain threshold for interval to be considered as a seed. Default: 5", - action="store", - type=float, - default=GAIN, -) -parser.add_argument( - "--cnsize_min", - dest="cnsize_min", - help="OPTIONAL: Minimum size (in bp) for interval to be considered as a seed. Default: 100000", - action="store", - type=int, - default=CNSIZE_MIN, -) -parser.add_argument( - "--ref", - dest="ref", - help='Values: [hg19, GRCh37, GRCh38, GRCh38_viral, mm10, GRCm38]. "hg19", "GRCh38", "mm10" : chr1, .. chrM etc / "GRCh37", "GRCm38" : \'1\', \'2\', .. \'MT\' etc/ "None" : Do not use any annotations. 
AA can tolerate additional chromosomes not stated but accuracy and annotations may be affected.', - metavar="STR", - action="store", - type=str, - choices=["hg19", "GRCh37", "GRCh38", "GRCh38_viral", "mm10", "GRCm38"], - required=True, -) -parser.add_argument( - "--no_cstats", - dest="no_cstats", - help="Do not re-use coverage statistics from coverage.stats.", - action="store_true", - default=False, -) - -args = parser.parse_args() - -global_names.REF = args.ref -import ref_util as hg - -if args.bed != "": - rdAlts = args.bed - -if args.out != "": - outname = args.out + ".bed" -else: - outname = os.path.splitext(rdAlts)[0] + "_amplified.bed" - -GAIN, CNSIZE_MIN = args.gain, args.cnsize_min - -rdList0 = hg.interval_list(rdAlts, "bed") -if rdList0: - try: - if len(rdList0[0].info) == 0: - logging.error( - "ERROR: CNV estimate bed file had too few columns.\n" "Must contain: chr pos1 pos2 cnv_estimate\n" - ) - sys.exit(1) - - _ = float(rdList0[0].info[-1]) - - except ValueError: - logging.error("ERROR: CNV estimates must be in last column of bed file.\n") - sys.exit(1) - -tempL = [] -for r in rdList0: - if args.ref == "GRCh38_viral" and not r.chrom.endswith("chr"): - tempL.append(r) - - elif float(r.info[-1]) > GAIN: - tempL.append(r) - -rdList = hg.interval_list(tempL) - -# rdList = hg.interval_list([r for r in rdList0 if float(r.info[-1]) > GAIN or (args.ref == "GRCh38_viral" and not r.chrom.endswith("chr"))]) - -if args.bam != "": - import bam_to_breakpoint as b2b - - if os.path.splitext(args.bam)[-1] == ".cram": - bamFile = pysam.Samfile(args.bam, "rc") - else: - bamFile = pysam.Samfile(args.bam, "rb") - - cstats = None - cb = bamFile - if os.path.exists(os.path.join(hg.DATA_REPO, "coverage.stats")) and not args.no_cstats: - coverage_stats_file = open(os.path.join(hg.DATA_REPO, "coverage.stats")) - for l in coverage_stats_file: - ll = l.strip().split() - if not ll: - continue - bamfile_pathname = str(cb.filename.decode()) - if ll[0] == os.path.abspath(bamfile_pathname): - bamfile_filesize = os.path.getsize(bamfile_pathname) - - cstats = tuple(map(float, ll[1:])) - if len(cstats) < 15 or cstats[13] != 3 or bamfile_filesize != int(cstats[14]) or any(np.isnan(cstats)): - cstats = None - - coverage_stats_file.close() - - bamFileb2b = b2b.bam_to_breakpoint(bamFile, coverage_stats=cstats) - pre_int_list = [] - for r in rdList: - try: - chrom_cov_ratio = bamFileb2b.median_coverage(refi=r)[0] / bamFileb2b.median_coverage()[0] - # print("chrom ratio " + r.chrom + " " + str(chrom_cov_ratio)) - if ( - float(r.info[-1]) - > GAIN + 2 * max(1.0, bamFileb2b.median_coverage(refi=r)[0] / bamFileb2b.median_coverage()[0]) - 2 - and bamFileb2b.median_coverage(refi=r)[0] / bamFileb2b.median_coverage()[0] > 0 - ): - if r.size() < 10000000 or float(r.info[-1]) > 1.5 * GAIN: - pre_int_list.append(r) - - elif float(r.info[-1]) > 1 and args.ref == "GRCh38_viral" and not r.chrom.startswith("chr"): - pre_int_list.append(r) - - except ZeroDivisionError: - logging.error("zero division error", r.chrom, args.ref, float(r.info[-1])) - - # if float(r.info[-1]) > 1 and args.ref == "GRCh38_viral" and not r.chrom.startswith("chr"): - # pre_int_list.append(r) - # - continue - - rdList = hg.interval_list(pre_int_list) - -amplicon_listl = rdList - -cr = hg.conserved_regions -uc_list = hg.interval_list([]) -for a in amplicon_listl: - if ( - len(hg.interval_list([a]).intersection(cr)) == 0 - or a.size() - > max(1000000, 10 * sum([a.intersection(ci[1]).size() for ci in hg.interval_list([a]).intersection(cr)])) - or a.size() - 
sum([a.intersection(ci[1]).size() for ci in hg.interval_list([a]).intersection(cr)]) > 2000000 - ): - if (len(hg.interval_list([a]).intersection(cr))) == 0: - uc_list.append(a) - else: - cra = hg.interval_list([a]).intersection(cr) - cpos = a.start - for crai in cra: - if cpos < crai[1].start - 1000000: - uc_list.append(hg.interval(a.chrom, cpos, crai[1].start - 1000000, info=a.info)) - cpos = crai[1].end + 1000000 - if a.end > cpos: - uc_list.append(hg.interval(a.chrom, cpos, a.end, info=a.info)) - -new_uc_list = [] -for a in uc_list: - if args.ref == "GRCh38_viral" and not a.chrom.startswith("chr"): - if a.rep_content() < 2.5: - new_uc_list.append(a) - else: - if float(a.info[-1]) * a.segdup_uniqueness() > GAIN and a.rep_content() < 2.5: - new_uc_list.append(a) - -uc_merge = hg.interval_list(new_uc_list).merge_clusters(extend=300000) - -with open(outname, "w") as outfile: - for a in uc_merge: - is_viral = False - if args.ref == "GRCh38_viral" and not a[0].chrom.startswith("chr"): - is_viral = True - - if sum([ai.size() for ai in a[1]]) > CNSIZE_MIN or is_viral: - outfile.write( - "\t".join( - [ - str(a[0]), - str(sum([ai.size() * float(ai.info[-1]) for ai in a[1]]) / sum([ai.size() for ai in a[1]])), - rdAlts, - ] - ) - + "\n" - ) diff --git a/bin/bam2bam.py b/bin/bam2bam.py old mode 100644 new mode 100755 diff --git a/bin/bam_to_breakpoint.py b/bin/bam_to_breakpoint.py deleted file mode 100755 index 0f72fe0d..00000000 --- a/bin/bam_to_breakpoint.py +++ /dev/null @@ -1,3682 +0,0 @@ -# This software is Copyright 2017 The Regents of the University of California. All Rights Reserved. Permission to copy, modify, and distribute this software and its documentation for educational, research and non-profit purposes, without fee, and without a written agreement is hereby granted, provided that the above copyright notice, this paragraph and the following three paragraphs appear in all copies. Permission to make commercial use of this software may be obtained by contacting: -# -# Office of Innovation and Commercialization -# -# University of California -# -# La Jolla, CA 92093-0910 -# -# (858) 534-5815 -# -# invent@ucsd.edu -# -# This software program and documentation are copyrighted by The Regents of the University of California. The software program and documentation are supplied "as is", without any accompanying services from The Regents. The Regents does not warrant that the operation of the program will be uninterrupted or error-free. The end-user understands that the program was developed for research purposes and is advised not to rely exclusively on the program for any reason. -# -# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 
- -# Author: Viraj Deshpande -# Contact: virajbdeshpande@gmail.com -# Source: https://github.com/jluebeck/AmpliconArchitect -# Commit: 2172cdfd5b2834f98f60a5ee77f282249e16f527 - -import itertools -from time import time -import pysam -import math -import copy -from collections import defaultdict -import mosek_solver -import sys -import numpy as np -from scipy import stats -import heapq -import os -import logging -import bisect -import matplotlib - -matplotlib.use("Agg") -import matplotlib.pyplot as plt -from matplotlib.patches import Ellipse, Rectangle, Arc -import matplotlib.ticker as ticker -from matplotlib import gridspec -import random -import re -from past.builtins import xrange -from functools import reduce - -from breakpoint_graph import * -import ref_util as hg -from mycolors import * -import global_names - - -# use Arial font if you have it. will fall back to default if not available. -matplotlib.rcParams["font.family"] = "sans-serif" -matplotlib.rcParams["font.sans-serif"] = ["Arial"] -matplotlib.rcParams["pdf.fonttype"] = 42 - -summary_logger = logging.getLogger("summary") -graph_logger = logging.getLogger("graph") -cycle_logger = logging.getLogger("cycle") - -# suppress some specific harmless numpy warnings during AA -np.seterr( - divide="ignore", - invalid="ignore", -) -TSTART = global_names.TSTART - - -class breakpoint_cluster: - def __init__(self, edge, bamfile, max_insert): - self.edge = edge - - -class bam_to_breakpoint: - def __init__( - self, - bamfile, - sample_name="", - read_length=100, - max_insert=400, - insert_size=300, - num_sdevs=3, - window_size=10000, - min_coverage=30, - pair_support=-1, - pair_support_min=2, - downsample=-1, - secondary_index=None, - coverage_stats=None, - coverage_windows=None, - sensitivems=False, - span_coverage=True, - tstart=0, - ): - self.bamfile = bamfile - self.sample_name = sample_name - self.window_size = window_size - self.min_coverage = min_coverage - self.max_insert = max_insert - self.insert_size = insert_size - self.num_sdevs = num_sdevs - self.read_length = read_length - self.secondary_index = secondary_index - self.gc_scale = defaultdict(lambda: 1.0) # self.gc_scaling() - self.gc_set = False - self.ms_window_size = 10000 - self.downsample = downsample - self.downsample_ratio = 1 - self.sensitivems = sensitivems - self.span_coverage = span_coverage - self.mapping_quality_cutoff = 5 - self.breakpoint_mapping_quality_cutoff = 20 - self.breakpoint_entropy_cutoff = 0.75 - self.pair_support_min = pair_support_min - hg.update_chrLen([(c["SN"], c["LN"]) for c in self.bamfile.header["SQ"]]) - self.discordant_edge_calls = {} - self.interval_coverage_calls = {} - self.tstart = tstart if tstart != 0 else TSTART - if coverage_stats is None: - self.basic_stats_set = False - self.median_coverage(window_list=coverage_windows) - else: - ( - wc_10000_median, - wc_10000_avg, - wc_10000_std, - wc_300_median, - wc_300_avg, - wc_300_std, - self.read_length, - self.insert_size, - self.insert_std, - self.min_insert, - self.max_insert, - self.pair_support, - self.percent_proper, - _, - _, - ) = coverage_stats - self.basic_stats = coverage_stats - self.basic_stats_set = True - r = coverage_stats - - if self.downsample < 0 or self.downsample > self.basic_stats[0]: - self.downsample_ratio = 1 - elif self.downsample == 0: - self.downsample_ratio = 10.0 / self.basic_stats[0] if self.basic_stats[0] > 10 else 1 - else: - self.downsample_ratio = ( - float(self.downsample) / self.basic_stats[0] if self.basic_stats[0] > float(self.downsample) else 1 - ) - - if 
self.downsample_ratio != 1: - rr = self.downsample_ratio - rsq = math.sqrt(rr) - r = [i[0] * i[1] for i in zip([rr, rr, rsq, rr, rr, rsq, 1, 1, 1, 1, 1, 1, 1], r)] - r[11] = max((r[4] / 10.0) * ((r[7] - r[6]) / 2 / r[6]) * r[12], 2) - self.pair_support = r[11] - self.downsample_stats = r - else: - self.downsample_stats = self.basic_stats - self.coverage_logs = {} - - if pair_support != -1: - self.pair_support = pair_support - - # Methods to find coverage and other statistics of bam file - - def fetch(self, c, s, e): - if s > e: - (s, e) = (e, s) - if s < 0: - s = 1 - if s > hg.chrLen[hg.chrNum(c)]: - s = hg.chrLen[hg.chrNum(c)] - 1 - e = hg.chrLen[hg.chrNum(c)] - 1 - if e < 0: - s = 1 - e = 1 - if e > hg.chrLen[hg.chrNum(c)]: - e = hg.chrLen[hg.chrNum(c)] - 1 - if self.downsample_ratio == 1: - for a in self.bamfile.fetch(c, s, e + 1): - yield a - else: - for a in self.bamfile.fetch(c, s, e + 1): - random.seed(a.query_name) - if random.uniform(0, 1) < self.downsample_ratio: - yield a - - def interval_coverage(self, i, clip=False, gcc=False): - call_args = (i.chrom, i.start, i.end, clip, gcc) - if call_args in self.interval_coverage_calls: - return self.interval_coverage_calls[call_args] - if gcc: - wc_raw = self.window_coverage(i) - wc_corrected = 0 - j = 0 - for w in wc_raw: - alist = [a for a in self.fetch(w[0].chrom, w[0].start, w[0].end)] - wc_corrected += w[0].size() * w[1] / self.gc_scaling()[int(w[0].gc_content() * 10) / 10.0] - if ( - w[0].size() * w[1] / self.gc_scaling()[int(w[0].gc_content() * 10) / 10.0] - > 10 * len(alist) * self.read_length - ): - print( - str(i).strip(), - str(w[0]).strip(), - wc_corrected, - len(alist), - w[1], - self.gc_scaling()[int(w[0].gc_content() * 10) / 10.0], - w[0].gc_content(), - w[0].sequence(), - ) - j += 1 - if j > 100: - raise ValueError("j>100") - self.interval_coverage_calls[call_args] = wc_corrected / i.size() - return self.interval_coverage_calls[call_args] - s2 = i.start - e2 = i.end - if i.start < 0: - s2 = 0 - if i.end > hg.chrLen[hg.chrNum(i.chrom)]: - e2 = hg.chrLen[hg.chrNum(i.chrom)] - if s2 >= e2: - return 0 - - # if e2 - s2 >= window_size and clip == False and gcc == False: - # return len(alist) * self.read_length / float(e2 - s2) - # sumb = 0 - # if clip == True or (clip == False and i.size() >= 100 * self.read_length): - # return len(alist) * self.read_length / float(i.size()) - if clip == True or (clip is None and e2 - s2 <= 1000): - icc = ( - sum( - [ - sum(a) - for a in self.bamfile.count_coverage( - i.chrom, s2, e2, quality_threshold=self.mapping_quality_cutoff - ) - ] - ) - * self.downsample_ratio - / max(1.0, float(e2 - s2 + 1)) - ) - self.interval_coverage_calls[call_args] = icc - return self.interval_coverage_calls[call_args] - else: - alist_len = len( - [ - a - for a in self.fetch(i.chrom, s2, e2) - if not a.is_unmapped - and a.reference_end - 1 <= e2 - and a.mapping_quality > self.mapping_quality_cutoff - ] - ) - self.interval_coverage_calls[call_args] = alist_len * self.read_length / max(1.0, float(e2 - s2 + 1)) - return self.interval_coverage_calls[call_args] - - # Maintainer found this code block is unreachable - # for a in alist: - # ai = hg.interval(a, bamfile=self.bamfile).intersection(i) - # if ai is not None: - # sumb += ai.size() - # if sumb / float(i.size()) > 10 * len(alist) * self.read_length / float(i.size()): - # print(str(i), sumb, len(alist)) - # raise ValueError("isize exception") - # self.interval_coverage_calls[call_args] = sumb / float(i.size()) - # return self.interval_coverage_calls[call_args] - - 
def window_coverage_stats(self, i, window_size=-1, gcc=False): - if window_size == -1: - window_size = self.max_insert - self.read_length - j = range(i.start, i.end, window_size) - jj = [hg.interval(i.chrom, k, k + window_size) for k in j] - cc = [self.interval_coverage(k, gcc=gcc) for k in jj] - dd = [abs(cc[j + 1] - cc[j]) for j in range(len(jj) - 1)] - return (sum(cc) / len(cc), sum(dd) / len(dd)) - - def window_coverage(self, i, window_size=-1, gcc=False, clip=None, exact=True): - # print str(i) - if window_size == -1: - window_size = self.max_insert - self.read_length - - def win_breakup(i, window_size): - if exact: - (istart, iend) = (i.start, i.end) - else: - istart = window_size * int(round(float(i.start) / window_size)) - iend = window_size * int(round(float(i.end) / window_size)) - for k in xrange(istart, iend, window_size): - yield hg.interval(i.chrom, k, k + window_size - 1) - - for k in win_breakup(i, window_size): - yield (k, self.interval_coverage(k, gcc=gcc, clip=clip)) - # return [(k, self.interval_coverage(k, gcc=gcc)) for k in jj] - - def median_coverage(self, window_size=-1, gcc=False, refi=-1, window_list=None): - if (window_size == 10000 or window_size == -1) and self.basic_stats_set and refi == -1: - return self.downsample_stats - if window_size == 300 and self.basic_stats_set and refi == -1: - return self.downsample_stats[3:6] - - num_iter = 1000 - iteri = 0 - chroffset = 0 - sumchrLen = sum([l for l in hg.chrLen.values()]) - if refi != -1: - if type(refi) == str: - sumchrLen = hg.chrLen[hg.chrNum(refi)] - chroffset = hg.absPos(refi, 1) - elif type(refi) == hg.interval: - if len([i for i in hg.centromere_list if i.chrom == refi.chrom]) == 0: - chr_cent = None - else: - chr_cent = [i for i in hg.centromere_list if i.chrom == refi.chrom][0] - if chr_cent is None: - sumchrLen = hg.chrLen[hg.chrNum(refi.chrom)] - chroffset = hg.absPos(refi.chrom, 1) - elif chr_cent.end > refi.end and chr_cent.start > refi.start: - sumchrLen = chr_cent.start - chroffset = hg.absPos(refi.chrom, 1) - elif chr_cent.start < refi.start and chr_cent.end < refi.end: - sumchrLen = hg.chrLen[hg.chrNum(refi.chrom)] - chr_cent.end - chroffset = hg.absPos(refi.chrom, 1) + chr_cent.end - else: - sumchrLen = hg.chrLen[hg.chrNum(refi.chrom)] - chroffset = hg.absPos(refi.chrom, 1) - # if hg.chrPos(chroffset) is None: - if refi != -1: - cp = hg.chrPos(chroffset) - if cp is not None: - ii = hg.interval(cp[0], cp[1], cp[1] + sumchrLen) - unconserved_len = sumchrLen - sum( - [i[0].intersection(i[1]).size() for i in hg.interval_list([ii]).intersection(hg.conserved_regions)] - ) - if (sumchrLen < 1000000 or (refi != -1 and unconserved_len < 1000000)) and window_size == -1: - return self.downsample_stats - - elif (sumchrLen < 1000000) and window_size == -1: - return self.downsample_stats - - if (refi != -1 or window_size != -1) and (chroffset, sumchrLen, window_size) in self.coverage_logs: - return self.coverage_logs[(chroffset, sumchrLen, window_size)] - # logging.info("Calculating median arm coverage " + str(refi) + " " + str(window_size)) - - if not self.basic_stats_set: - read_length = [] - insert_size = [] - window_list_index = 0 - non_mapping = 0 - random.seed(global_names.SEED) - while (window_list is not None and window_list_index < len(window_list)) or ( - window_list is None and iteri <= num_iter - ): - if window_list is None: - newpos = int(random.random() * sumchrLen) + chroffset - else: - cwindow = window_list[window_list_index] - window_list_index += 1 - if cwindow.end - cwindow.start < 10000: - 
continue - newpos = hg.absPos(cwindow.chrom, ((cwindow.end + cwindow.start) / 2) - 5000) - if hg.chrPos(newpos) is None: - logging.debug( - "Unable to locate reference position: " - + refi.chrom - + " " - + str(refi.start) - + " " - + str(refi.end) - + " " - + str(newpos) - + " " - + str(sumchrLen) - ) - iteri += 1 - continue - (c, p) = hg.chrPos(newpos) - if ( - c not in self.bamfile.references - or p < 10000 - or hg.chrLen[hg.chrNum(c)] < p + 10000 - or len( - hg.interval_list([hg.interval(c, p, p + 10000)]).intersection( - hg.conserved_regions, extend=10000 - ) - ) - > 0 - or len( - hg.interval_list([hg.interval(c, p, p + 10000)]).intersection(hg.centromere_list, extend=10000) - ) - > 0 - ): - continue - read_length += [ - a.infer_query_length(always=False) for a in self.fetch(c, p, p + 10000) if not a.is_unmapped - ] - insert_size += [ - a.template_length - for a in self.fetch(c, p, p + 10000) - if a.is_proper_pair and not a.is_reverse and a.template_length < 10000 and a.template_length > 0 - ] - iteri += 1 - self.read_length = np.average(read_length) - self.insert_size = np.average(insert_size) - percent_proper = len(insert_size) * 2.0 / (len(read_length) + non_mapping) - self.percent_proper = percent_proper - self.insert_std = np.std(insert_size) - self.max_insert = self.insert_size + self.num_sdevs * self.insert_std - self.min_insert = max(0, self.insert_size - self.num_sdevs * self.insert_std) - - if window_size not in [-1, 300, 10000]: - ws_list = [window_size] - else: - ws_list = [10000, 300] - - wc_median = [] - wc_avg = [] - wc_std = [] - random.seed(global_names.SEED) - for ws in ws_list: - wc_ws = [] - iteri = 0 - window_list_index = 0 - while (window_list is not None and window_list_index < len(window_list)) or ( - window_list is None and iteri <= num_iter - ): - if window_list is None: - newpos = int(random.random() * sumchrLen) + chroffset - else: - cwindow = window_list[window_list_index] - window_list_index += 1 - if cwindow.end - cwindow.start < 10000: - continue - newpos = hg.absPos(cwindow.chrom, ((cwindow.end + cwindow.start) / 2) - 5000) - if hg.chrPos(newpos) is None: - logging.warning( - "Unable to locate reference position: " - + refi.chrom - + " " - + str(refi.start) - + " " - + str(refi.end) - + " " - + str(newpos) - + " " - + str(sumchrLen) - ) - iteri += 1 - continue - (c, p) = hg.chrPos(newpos) - if ( - c not in self.bamfile.references - or p < ws - or hg.chrLen[hg.chrNum(c)] < p + ws - or len(hg.interval_list([hg.interval(c, p, p + ws)]).intersection(hg.conserved_regions, extend=ws)) - > 0 - or len(hg.interval_list([hg.interval(c, p, p + ws)]).intersection(hg.centromere_list, extend=ws)) - > 0 - ): - continue - wc_ws.append(self.interval_coverage(hg.interval(c, p, p + ws), gcc=gcc)) - iteri += 1 - wc_ws.sort() - wc_ws_median = np.median(wc_ws) - wc_ws_filter = [c for c in wc_ws if c < 5 * wc_ws_median and c > 0] - if len(wc_ws_filter) == 0: - print(len(wc_ws_filter), len(wc_ws), len([c for c in wc_ws if c > 0]), wc_ws_median) - wc_median.append(0) - wc_avg.append(0) - wc_std.append(0) - else: - wc_median.append(wc_ws_filter[len(wc_ws_filter) // 2]) - wc_avg.append(np.average(wc_ws_filter)) - wc_std.append(np.std(wc_ws_filter)) - - if window_size not in [-1, 300, 10000] or refi != -1: - self.coverage_logs[(chroffset, sumchrLen, window_size)] = (wc_median[0], wc_avg[0], wc_std[0]) - return (wc_median[0], wc_avg[0], wc_std[0]) - - (wc_10000_median, wc_10000_avg, wc_10000_std) = (wc_median[0], wc_avg[0], wc_std[0]) - (wc_300_median, wc_300_avg, wc_300_std) = 
(wc_median[1], wc_avg[1], wc_std[1]) - bamfile_pathname = str(self.bamfile.filename.decode()) - bamfile_filesize = os.path.getsize(bamfile_pathname) - self.pair_support = max( - int( - round( - (wc_300_avg / 10.0) - * ((self.insert_size - self.read_length) / 2 / self.read_length) - * self.percent_proper - ) - ), - self.pair_support_min, - ) - rstats = ( - wc_10000_median, - wc_10000_avg, - wc_10000_std, - wc_300_median, - wc_300_avg, - wc_300_std, - self.read_length, - self.insert_size, - self.insert_std, - self.min_insert, - self.max_insert, - self.pair_support, - self.percent_proper, - self.num_sdevs, - bamfile_filesize, - ) - if refi == -1: - self.basic_stats = rstats - self.basic_stats_set = True - print( - "read length:", - self.read_length, - "insert size:", - self.insert_size, - "insert std dev:", - self.insert_std, - "max_insert:", - self.max_insert, - "percent proper:", - percent_proper, - "num_sdevs", - self.num_sdevs, - ) - print("coverage stats", self.basic_stats, len(wc_ws_filter)) - print("pair support", self.pair_support) - coverage_stats_file = open(hg.DATA_REPO + "/coverage.stats", "a") - coverage_stats_file.write( - os.path.abspath(self.bamfile.filename.decode("utf-8")) + "\t" + "\t".join(map(str, rstats)) + "\n" - ) - coverage_stats_file.close() - - r = rstats - if self.downsample < 0 or self.downsample > self.basic_stats[0]: - self.downsample_ratio = 1 - elif self.downsample == 0: - self.downsample_ratio = 10.0 / self.basic_stats[0] if self.basic_stats[0] > 10 else 1 - else: - self.downsample_ratio = ( - float(self.downsample) / self.basic_stats[0] if self.basic_stats[0] > float(self.downsample) else 1 - ) - if self.downsample_ratio != 1: - rr = self.downsample_ratio - rsq = math.sqrt(rr) - r = [i[0] * i[1] for i in zip([rr, rr, rsq, rr, rr, rsq, 1, 1, 1, 1, 1, 1, 1, 1, 1], r)] - r[11] = max((r[4] / 10.0) * ((r[7] - r[6]) / 2 / r[6]) * r[12], 2) - self.pair_support = r[11] - self.downsample_stats = r - - else: - self.downsample_stats = self.basic_stats - - return rstats - - def gc_scaling(self): - if self.gc_set: - return self.gc_scale - gc_total_rd = {i / 10.0: 0 for i in range(11)} - gc_num_windows = {i / 10.0: 0 for i in range(11)} - for ri in range(len(self.bamfile.references)): - # print self.bamfile.references[ri] - # print hg.chrLen - if hg.chrNum(self.bamfile.references[ri]) not in hg.chrLen: - continue - # print self.bamfile.references[ri] - wc = self.window_coverage(hg.interval(self.bamfile.references[ri], 0, self.bamfile.lengths[ri])) - lwc = 0 - for w in wc: - lwc += 1 - gc_total_rd[int(w[0].gc_content() * 10.0) / 10.0] += w[1] - gc_num_windows[int(w[0].gc_content() * 10.0) / 10.0] += 1 - # print gc_num_windows, gc_total_rd, lwc - break - sc_factor = sum(gc_total_rd.values()) / sum(gc_num_windows.values()) - scale = {} - for i in gc_total_rd: - if gc_num_windows[i] == 0: - scale[i] = 1.0 - else: - scale[i] = gc_total_rd[i] / gc_num_windows[i] / sc_factor - self.gc_scale = scale - self.gc_set = True - logging.debug("GC scale:", scale) - return scale - - # Methods to find all coverage shifts in amplicon - def meanshift(self, i, window_size=-1, hb=2, cov=None, rd_global=-1, h0=-1, gcc=False, n=-1): - if window_size == -1: - window_size = self.max_insert - self.read_length - if rd_global == -1: - rd_global = self.median_coverage(window_size, gcc)[0] - if h0 == -1: - h0 = self.median_coverage(window_size, gcc)[2] - if rd_global == 0: - rd_global = self.median_coverage()[0] - h0 = self.median_coverage()[2] - if n == -1: - n = min(max(100, 10 * hb), 10000000 // 
window_size) - j = range(len(cov)) - if cov is None: - s2 = i.start - window_size * n - e2 = i.end + window_size * n - if s2 < 0: - s2 = 0 - if e2 > hg.chrLen[hg.chrNum(i.chrom)]: - e2 = hg.chrLen[hg.chrNum(i.chrom)] - j = range(s2, e2, window_size) - # j = range(i.start, i.end, window_size) - # jj = [hg.interval(i.chrom, k, k + window_size) for k in j] - i2 = hg.interval(i.chrom, s2, e2) - cov = [c for c in self.window_coverage(i2, window_size, gcc, exact=False)] - - # cov = [self.interval_coverage(k) for k in jj] - # print window_size, len(cov), str(cov[0][0]).strip(), cov[0][1], str(cov[1][0]).strip(), cov[1][1] - def hr(wi): - if cov[wi][1] < rd_global / 4.0: - return h0 / 2.0 - else: - # return math.sqrt(cov[wi][1] * self.read_length / window_size) - return math.sqrt(cov[wi][1] / rd_global) * h0 - - dfi = [ - ( - cov[wi][0], - sum( - [ - wj - * math.exp(-0.5 * wj**2 / hb**2) - * math.exp(-0.5 * (cov[wi + wj][1] - cov[wi][1]) ** 2 / hr(wi) ** 2) - for wj in range(-1 * n, n + 1) - ] - ), - ) - for wi in range(n, len(j) - n) - if cov[wi][0] is not None - ] - # print 'meanshift', str(i), len(cov), len(dfi), len([c for c in cov if c[0] is not None]), [(str(c[0][0]), c[0][1], c[1][1]) for c in zip([cc for cc in cov if cc[0] is not None], dfi) ] - # [(interval,ms)] - return dfi - - def meanshift_pval(self, s1, s2): - if len(s1) <= 1 and len(s2) <= 1: - return 1.0 - if len(s1) > 1 and len(s2) > 1: - return stats.ttest_ind(s1, s2, equal_var=False)[1] - elif len(s1) == 1: - zscore = abs(s1[0] - np.average(s1 + s2)) / np.std(s1 + s2) - return stats.norm.sf(zscore) - elif len(s2) == 1: - zscore = abs(s2[0] - np.average(s1 + s2)) / np.std(s1 + s2) - return stats.norm.sf(zscore) - return 1.0 - - def meanshift_segmentation(self, i, window_size=-1, gcc=False, pvalue=0.01): - logging.debug("Computing meanshift segmentation on " + str(i)) - if window_size == -1: - window_size = 10000 - i = hg.interval( - i.chrom, - window_size * int(round(float(i.start) / window_size)), - window_size * int(round(float(i.end) / window_size)), - ) - mc = self.median_coverage(window_size, gcc) - rd_global = mc[0] - h0 = mc[2] - hb_profile = [2, 5, 10, 50, 100] - # hb_profile = [2] - n = min(max(100, 10 * hb_profile[-1]), 10000000 // window_size) # number of windows used to calculate meanshift - logging.debug("MS: " + str(i) + " window_size, n: " + str((window_size, n))) - s2 = i.start - window_size * n - e2 = i.end + window_size * n - logging.debug("MS: " + str(i) + " s2, e2: " + str((s2, e2))) - startskip = 0 - endskip = 0 - if s2 < 0: - s2 = i.start % window_size - startskip = n - (i.start - s2) // window_size - if e2 > hg.chrLen[hg.chrNum(i.chrom)]: - hgl = hg.chrLen[hg.chrNum(i.chrom)] - e2 = hgl - (hgl - i.end) % window_size - endskip = n - (hgl - i.end) // window_size - - i2 = hg.interval(i.chrom, s2, e2) - logging.debug("MS: " + str(i) + " startskip,endskip" + str((startskip, endskip))) - cov = [c for c in self.window_coverage(i2, window_size, gcc, exact=False)] - cov = [(None, 0) for ni in range(startskip)] + cov + [(None, 0) for ni in range(endskip)] - frozen = [] - - def hr(c, wlen): - if c < rd_global / 4.0: - return h0 / 2.0 - else: - return math.sqrt(c / rd_global) * h0 - - for hb in hb_profile: - cov2 = copy.copy(cov) - for ms_iterate in range(1): - fi = -1 - if len(frozen) > 0: - fi = 0 - ms = [w for w in self.meanshift(i, window_size, hb, cov2, rd_global=rd_global, h0=h0, gcc=gcc, n=n)] - segs = [] - new_seg = [] - msi = 0 - # print 'THIS0', len(frozen), fi #, frozen[0][0][0].start - # for ff in 
range(len(frozen)): - # print "THIS", ff, frozen[ff][0][0][0].start - while msi < len(ms): - if fi >= 0 and fi < len(frozen) and ms[msi][0].start == frozen[fi][0][0][0].start: - if len(new_seg) > 0 and (frozen[fi][1] % 2 == 1 or (ms[msi][1] > 0 and ms[msi - 1][1] <= 0)): - segs.append(new_seg) - new_seg = ms[msi : msi + len(frozen[fi][0])] - else: - new_seg += ms[msi : msi + len(frozen[fi][0])] - # segs.append(ms[msi: msi + len(frozen[fi][0])]) - msi += len(frozen[fi][0]) - fi += 1 - continue - elif ms[msi][1] > 0 and ms[msi - 1][1] <= 0 and len(new_seg) > 0: - segs.append(new_seg) - new_seg = [] - new_seg.append(ms[msi]) - msi += 1 - if len(new_seg) > 0: - segs.append(new_seg) - cov2 = copy.copy(cov[:n]) - covi = n - for msi in range(len(segs)): - s = segs[msi] - c = np.average([cc[1] for cc in cov[covi : covi + len(s)]]) - cov2 += [(ss[0], c) for ss in segs[msi]] - covi += len(segs[msi]) - cov2 += cov[-n:] - ci = n - frozen = [] - cpi = n - for si in range(len(segs)): - c = cov2[ci][1] - # c0 = cov[ci][1] - # lseg = segs[si][-1][0].end - segs[si][0][0].start - freeze = 0 - # if segs[si][0][0].start < 54857402 and segs[si][-1][0].end > 54857402: - # print (segs[si][0][0].start, segs[si][-1][0].end), (segs[si-1][0][0].start, segs[si-1][-1][0].end), (segs[si+1][0][0].start, segs[si+1][-1][0].end) - # print stats.ttest_ind([cc[1] for cc in cov[ci:ci + len(segs[si])]], [cs[1] for cs in cov[ci - len (segs[si - 1]):ci]], equal_var=False) - # print abs(cp - c), 3 * math.sqrt(max(cp, c) / rd_global) * h0 - # print abs(cn - c), 3 * math.sqrt(max(cn, c) / rd_global) * h0 - # print [cs[1] for cs in cov[ci - len (segs[si - 1]):ci]] - # print [cc[1] for cc in cov[ci:ci + len(segs[si])]] - # print [cs[1] for cs in cov[ci + len (segs[si]):ci + len(segs[si]) + len(segs[si + 1])]] - if si > 0: - if len(segs[si]) < 15 or len(segs[si - 1]) < 15: - cp = cov2[ci - 1][1] - if abs(cp - c) > 3 * math.sqrt(max(cp, c) / rd_global) * h0: - # if abs(cp - c) > 2 * hr(c, window_size * len(segs[si])): - freeze |= 1 - if len(segs[si]) > 1 and len(segs[si - 1]) > 1: - if ( - self.meanshift_pval( - [cc[1] for cc in cov[ci : ci + len(segs[si])]], - [cs[1] for cs in cov[ci - len(segs[si - 1]) : ci]], - ) - < pvalue - ): - freeze |= 1 - if si < len(segs) - 1: - if len(segs[si]) < 15 or len(segs[si + 1]) < 15: - cn = cov2[ci + len(segs[si])][1] - if abs(cn - c) > 3 * math.sqrt(max(cn, c) / rd_global) * h0: - # if abs(cn - c) > 2 * hr(c, window_size * len(segs[si])): - freeze |= 2 - if ( - self.meanshift_pval( - [cc[1] for cc in cov[ci : ci + len(segs[si])]], - [cs[1] for cs in cov[ci + len(segs[si]) : ci + len(segs[si]) + len(segs[si + 1])]], - ) - < pvalue - ): - freeze |= 2 - # if freeze > 0: - frozen.append((segs[si], freeze, c, cov2[cpi : ci + len(segs[si])], cov[cpi : ci + len(segs[si])])) - ci += len(segs[si]) - if freeze > 0: - cpi = ci - # for f in frozen: - # print str(hg.interval(f[0][0][0].chrom, f[0][0][0].start, f[0][-1][0].end)), f[1], f[2], str(i) - # print '----...-------------...------------...-------------...-----------------...----------------------' - # (list of windows[(windowinterval,ms)], left%2/right%4freeze, avg_coverage) - - plist = [] - ms1list = [] - ms2list = [] - cms = [] - c1list = [] - c2list = [] - for msi in range(len(frozen)): - cms.append(frozen[msi]) - if frozen[msi][1] % 4 >= 2: - plist.append(frozen[msi][0][-1][0].end) - avgc = np.average(reduce(lambda x, y: x + y, [[c[1] for c in cc[4]] for cc in cms], [])) - ms1list.append(avgc * 2 / self.median_coverage(window_size, gcc)[0]) - 
c1list.append(reduce(lambda x, y: x + y, [[c[1] for c in cc[4]] for cc in cms], [])) - if len(ms1list) > 1: - ms2list.append(avgc * 2 / self.median_coverage(window_size, gcc)[0]) - c2list.append((reduce(lambda x, y: x + y, [[c[1] for c in cc[4]] for cc in cms], []))) - cms = [] - if len(cms) > 0: - avgc = np.average(reduce(lambda x, y: x + y, [[c[1] for c in cc[4]] for cc in cms], [])) - ms2list.append(avgc * 2 / self.median_coverage(window_size, gcc)[0]) - c2list.append((reduce(lambda x, y: x + y, [[c[1] for c in cc[4]] for cc in cms], []))) - - shifts = list(zip(plist, ms1list, ms2list, c1list, c2list)) - - # for a in shifts: - # print a[0], a[1], a[2], len(a[3]), len(a[4]), str(i) - # print '---------------------------------------------------------' - if len(shifts) > 0: - merge = True - else: - merge = False - while merge: - shifts = list(shifts) - merge = False - mergelist = [] - for shiftsi in range(len(shifts)): - s3 = [shifts[shiftsi][3], shifts[shiftsi][3][1:], shifts[shiftsi][3][:-1], shifts[shiftsi][3][1:-1]] - s4 = [shifts[shiftsi][4], shifts[shiftsi][4][1:], shifts[shiftsi][4][:-1], shifts[shiftsi][4][1:-1]] - min_ttest_val = 1.0 - for s3i in s3: - for s4i in s4: - p = self.meanshift_pval(s3i, s4i) - min_ttest_val = min(min_ttest_val, p) - if min_ttest_val > pvalue: - mergelist.append(shiftsi) - if len(mergelist) > 0: - merge = True - plist = [] - ms1list = [] - ms2list = [] - c1list = [] - c2list = [] - c1 = [] - for shiftsi in range(len(shifts)): - c1.extend(shifts[shiftsi][3]) - if shiftsi not in mergelist: - plist.append(shifts[shiftsi][0]) - avgc = np.average(c1) - ms1list.append(avgc * 2 / self.median_coverage(window_size, gcc)[0]) - c1list.append(c1) - if len(plist) > 1: - c2list.append(c1) - ms2list.append(avgc * 2 / self.median_coverage(window_size, gcc)[0]) - c1 = [] - if len(plist) > 0: - c1.extend(shifts[-1][4]) - avgc = np.average(c1) - ms2list.append(avgc * 2 / self.median_coverage(window_size, gcc)[0]) - c2list.append(c1) - shifts = zip(plist, ms1list, ms2list, c1list, c2list) - # for a in shifts: - # print a[0], a[1], a[2], len(a[3]), len(a[4]) - # print '---------------------------------------------------------' - if self.sensitivems: - shifts_select = [s for s in shifts if abs(s[2] - s[1]) >= 1] - else: - shifts_select = [ - s for s in shifts if abs(s[2] - s[1]) >= max(1, min(max(s[2], s[1]) / 10.0, math.sqrt(max(s[2], s[1])))) - ] - if len(shifts_select) == 0: - return hg.interval_list( - [ - hg.interval( - i.chrom, - i.start, - i.end, - info={ - "cn": np.average([c[1] for c in cov[n:-n]]) * 2 / self.median_coverage(window_size, gcc)[0] - }, - ) - ] - ) - else: - shift_intervals = hg.interval_list([]) - start = i.start - for si in shifts_select: - shift_intervals.append(hg.interval(i.chrom, start, si[0], info={"cn": si[1]})) - start = si[0] + 1 - shift_intervals.append(hg.interval(i.chrom, start, i.end, info={"cn": shifts_select[-1][2]})) - return shift_intervals - - def meanshift_refined(self, i, window_size0=10000, window_size1=300, gcc=False, shifts_unrefined=None): - logging.debug("Meanshift refining " + i.chrom + ":" + str(i.start) + "-" + str(i.end)) - if hg.chrLen[hg.chrNum(i.chrom)] < 3 * window_size0: - logging.debug("small chrom") - ms_ws1 = self.meanshift_segmentation(i, window_size1, gcc) - for ii in ms_ws1: - ii.info["start_refined"] = True - ii.info["end_refined"] = True - logging.debug(str((ii.start, ii.end, ii.info["cn"]))) - return ms_ws1 - if shifts_unrefined is None: - shifts0 = self.meanshift_segmentation(i, window_size0, gcc, 
pvalue=0.0027) - else: - shifts0 = shifts_unrefined - - shift1_intervals = hg.interval_list(hg.interval(msi.chrom, msi.end, msi.end) for msi in shifts0[:-1]) - shift1_intervals = [msi[0] for msi in shift1_intervals.merge_clusters(extend=3 * window_size0)] - shifts1 = reduce( - lambda x, y: x + y, - [ - self.meanshift_segmentation( - hg.interval(i.chrom, s.start - 3 * window_size0, s.start + 3 * window_size0), - window_size1, - gcc, - pvalue=0.05, - ) - for s in shift1_intervals - ], - [], - ) - - matched_shifts = [] - prev_end = None - for s0i in range(len(shifts0[:-1])): - cndiff0 = shifts0[s0i + 1].info["cn"] - shifts0[s0i].info["cn"] - bests1i = None - bestscore = 0 - for s1i in range(len(shifts1) - 1): - if shifts1[s1i].end < i.start or shifts1[s1i].end >= i.end: - continue - if abs(shifts0[s0i].end - shifts1[s1i].end) >= window_size0: - continue - cndiff1 = shifts1[s1i + 1].info["cn"] - shifts1[s1i].info["cn"] - if cndiff0 * cndiff1 < 0 or cndiff0 / cndiff1 <= 0.5 and cndiff0 / cndiff1 >= 2: - continue - if bests1i is None: - bests1i = s1i - bestscore = abs(cndiff0 - cndiff1) - elif abs(cndiff0 - cndiff1) < bestscore: - bestscore = abs(cndiff0 - cndiff1) - bests1i = s1i - best_start = prev_end + 1 if prev_end is not None else shifts0[s0i].start - best_end = shifts1[bests1i].end if bests1i is not None else shifts0[s0i].end - matched_shifts.append( - hg.interval( - i.chrom, - best_start, - best_end, - info={ - "cn": shifts0[s0i].info["cn"], - "start_refined": prev_end is not None, - "end_refined": bests1i is not None, - }, - ) - ) - prev_end = shifts1[bests1i].end if bests1i is not None else None - if len(shifts0) > 1: - s0i = -1 - best_start = prev_end + 1 if prev_end is not None else shifts0[s0i].start - best_end = shifts0[s0i].end - matched_shifts.append( - hg.interval( - i.chrom, - best_start, - best_end, - info={"cn": shifts0[s0i].info["cn"], "start_refined": prev_end is not None, "end_refined": False}, - ) - ) - else: - matched_shifts.append( - hg.interval( - i.chrom, - i.start, - i.end, - info={"cn": shifts0[0].info["cn"], "start_refined": False, "end_refined": False}, - ) - ) - - for ii in matched_shifts: - logging.debug(str((ii.start, ii.end, ii.info["cn"]))) - - return matched_shifts - - def get_meanshift(self, i, window_size0=10000, window_size1=300, gcc=False): - logging.debug("get_meanshift on " + str(i)) - file_name = "%s_%s_%s_%s_cnseg.txt" % (self.sample_name, i.chrom, i.start, i.end) - if os.path.exists(file_name) and i.end - i.start > 50000: - logging.debug("Re-using cn-seg info in " + file_name) - msfile = open(file_name) - msr = [] - for line in msfile: - if len(line) == 0 or line[0] == "#": - continue - ll = line.strip().split() - msi = hg.interval( - str(ll[0]), - int(ll[1]), - int(ll[2]), - info={"cn": float(ll[3]), "start_refined": bool(ll[4]), "end_refined": bool(ll[5])}, - ) - msr.append(msi) - else: - msr = self.meanshift_refined(i, window_size0=window_size0, window_size1=window_size1, gcc=gcc) - msfile = open(file_name, "w") - msfile.write("#chrom\tstart\tend\tcn\tstart_refined\tend_refined\n") - for ms in msr: - msfile.write( - "%s\t%s\t%s\t%s\t%s\t%s\n" - % (ms.chrom, ms.start, ms.end, ms.info["cn"], ms.info["start_refined"], ms.info["end_refined"]) - ) - msfile.close() - return msr - - def interval_crossing_arcs(self, chrom, start, end, strand, ilist): - if strand == -1: - return [ - a - for a in self.fetch(chrom, max(0, start), min(end, hg.chrLen[hg.chrNum(chrom)])) - if not a.is_unmapped - and a.is_reverse - and ( - a.mate_is_unmapped - or 
a.next_reference_id == -1 - or len( - ilist.intersection( - [hg.interval(a.next_reference_name, a.next_reference_start, a.next_reference_start)] - ) - ) - == 0 - ) - ] - else: - return [ - a - for a in self.fetch(chrom, max(0, start), min(end, hg.chrLen[hg.chrNum(chrom)])) - if not a.is_unmapped - and not a.is_reverse - and ( - a.mate_is_unmapped - or a.next_reference_id == -1 - or len( - ilist.intersection( - [hg.interval(a.next_reference_name, a.next_reference_start, a.next_reference_start)] - ) - ) - == 0 - ) - ] - - # Methods to find breakpoint edges in amplicon - def get_mates(self, a): - gmt = time() - self.get_mates_num_calls += 1 - try: - miter = self.secondary_index.find(a.query_name) - retval = [m for m in miter if m.is_read1 != a.is_read1] - self.get_mates_time += time() - gmt - return retval - except: - # print time(), 'get_mates', str(a) - retval = [ - a2 - for a2 in self.fetch(a.next_reference_name, a.next_reference_start, a.next_reference_start + 1) - if a2.query_name == a.query_name and a2.is_read1 != a.is_read1 - ] - # retval = [self.bamfile.mate(a)] - # print time(), 'got_mates' - self.get_mates_time += time() - gmt - return retval - - def pair_support_count(self, chrom, position, strand, meanshift, foldup=False, sensitivems=True): - # str(hg.interval(f[0][0][0].chrom, f[0][0][0].start, f[0][-1][0].end)), f[1], f[2] - cd = 1 - for fi in range(len(meanshift)): - f = meanshift[fi] - if len(f) == 0: - continue - if not hg.interval(f[0].chrom, f[0].start, f[-1].end).intersects( - hg.interval(chrom, position, position), extend=self.ms_window_size - ): - continue - for pi in range(len(f)): - if f[pi].start + self.ms_window_size >= position: - break - # pi = bisect.bisect_left(f[1], (position,)) - if ( - pi > 0 - and pi < len(f) - and f[pi].start - self.ms_window_size <= position - and (f[pi].info["cn"] - f[pi - 1].info["cn"]) / strand > 0 - ): - cd = abs(f[pi].info["cn"] - f[pi - 1].info["cn"]) - elif pi > 0: - cd = f[pi - 1].info["cn"] - else: - cd = f[0].info["cn"] - mc = self.median_coverage() - cd = max(1, cd) - if self.sensitivems and sensitivems: - cd = min(cd, 10) - pcount = max(mc[4] * cd / 20.0 * ((self.insert_size - self.read_length) / 2 / self.read_length) * mc[12], 2) - pmincount = mc[11] - if pcount < mc[11]: - pcount = pmincount - return pcount - - def concordant_edge(self, v, bp_margin=0): - if v.pos == 0: - return (None, []) - elif v.strand == 1: - dlist = [ - a - for a in self.fetch(v.chrom, max(1, v.pos - self.max_insert), v.pos) - if not a.is_unmapped - and not a.is_reverse - and a.is_proper_pair - and a.next_reference_name == v.chrom - and a.next_reference_start >= v.pos - and a.reference_start < v.pos - bp_margin - and a.next_reference_start < a.reference_start + self.max_insert - self.read_length - ] - if len(dlist) > self.pair_support: - v2 = breakpoint_vertex(v.chrom, max(v.pos + 1, min([a.next_reference_start for a in dlist])), -1) - logging.debug( - "#TIME " + "%.3f\t" % (time() - TSTART) + " concordant edges " + str(v) + " " + str(len(dlist)) - ) - return (breakpoint_edge(v, v2), dlist) - else: - dlist = [ - a - for a in self.fetch(v.chrom, max(1, v.pos - self.max_insert), v.pos) - if not a.is_reverse - and a.is_proper_pair - and not a.is_unmapped - and a.next_reference_name == v.chrom - and a.next_reference_start >= v.pos - and a.reference_start < v.pos - bp_margin - and a.next_reference_start < a.reference_start + self.max_insert - self.read_length - ] - if len(dlist) > self.pair_support: - v2 = breakpoint_vertex(v.chrom, min(v.pos - 1, 
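# Sketch of the fallback mate lookup in get_mates above, assuming a coordinate-sorted,
# indexed BAM opened with pysam. The secondary name index used by the fast path is specific
# to this class and is omitted here.
import pysam

def find_mates(bamfile, a):
    # bamfile: pysam.AlignmentFile; a: pysam.AlignedSegment with a mapped mate.
    # Fetch alignments overlapping the mate's start and keep the other read of the template.
    return [
        m
        for m in bamfile.fetch(a.next_reference_name, a.next_reference_start,
                               a.next_reference_start + 1)
        if m.query_name == a.query_name and m.is_read1 != a.is_read1
    ]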
max([a.reference_end - 1 for a in dlist])), 1) - logging.debug( - "#TIME " + "%.3f\t" % (time() - TSTART) + " concordant edges " + str(v) + " " + str(len(dlist)) - ) - return (breakpoint_edge(v, v2), dlist) - logging.debug("#TIME " + "%.3f\t" % (time() - TSTART) + " concordant edges " + str(v) + " not found") - return (None, dlist) - - def foldup_count(self, chrom, position, strand, cdiff=-1): - interval = hg.interval( - chrom, - max(1, position - self.ms_window_size), - min(hg.chrLen[hg.chrNum(chrom)], position + self.ms_window_size), - ) - if strand == 1: - dlist = [ - a - for a in self.fetch(interval.chrom, interval.start, interval.end) - if not a.is_unmapped - and not a.is_reverse - and a.is_paired - and not a.is_proper_pair - and not a.mate_is_unmapped - and not a.mate_is_reverse - and a.reference_name == a.next_reference_name - and abs(a.next_reference_start - a.reference_start) < 100000 - ] # self.ms_window_size] - else: - dlist = [ - a - for a in self.fetch(interval.chrom, interval.start, interval.end) - if not a.is_unmapped - and a.is_reverse - and a.is_paired - and not a.is_proper_pair - and not a.mate_is_unmapped - and a.mate_is_reverse - and a.reference_name == a.next_reference_name - and abs(a.next_reference_start - a.reference_start) < 100000 - ] # self.ms_window_size] - return len(dlist) - - def refine_discordant_edge(self, e): - # logging.debug("#TIME " + '%.3f\t'%(time() - TSTART) + " refine discordant edge " + str(e)) - v1min = max(0, (e.v1.pos - self.max_insert + self.read_length if e.v1.strand == 1 else e.v1.pos) - 1) - v2min = max(0, (e.v2.pos - self.max_insert + self.read_length if e.v2.strand == 1 else e.v2.pos) - 1) - v1max = ( - min( - e.v1.pos + self.max_insert - self.read_length if e.v1.strand == -1 else e.v1.pos, - hg.chrLen[hg.chrNum(e.v1.chrom)], - ) - - 1 - ) - v2max = ( - min( - e.v2.pos + self.max_insert - self.read_length if e.v2.strand == -1 else e.v2.pos, - hg.chrLen[hg.chrNum(e.v2.chrom)], - ) - - 1 - ) - d1list = [a for a in self.fetch(e.v1.chrom, v1min, v1max) if not a.is_unmapped] - d2list = [a for a in self.fetch(e.v2.chrom, v2min, v2max) if not a.is_unmapped] - d1Set = set([(a.query_name, a.is_read1, a.is_reverse, a.is_secondary) for a in d1list]) - if e.v1.strand == e.v2.strand: - d2Set = set([(a.query_name, a.is_read1, not a.is_reverse, not a.is_secondary) for a in d2list]) - else: - d2Set = set([(a.query_name, a.is_read1, a.is_reverse, not a.is_secondary) for a in d2list]) - rSet = d1Set.intersection(d2Set) - if len(rSet) == 0: - return (e, 0, [], None) - multi_r = set() - d1reads = {} - d2reads = {} - for a in d1list: - if (a.query_name, a.is_read1, a.is_reverse, a.is_secondary) in d1reads: - multi_r.add((a.query_name, a.is_read1, a.is_reverse, a.is_secondary)) - d1reads[(a.query_name, a.is_read1, a.is_reverse, a.is_secondary)] = a - if e.v1.strand == e.v2.strand: - for a in d2list: - if (a.query_name, a.is_read1, not a.is_reverse, not a.is_secondary) in d2reads: - multi_r.add((a.query_name, a.is_read1, not a.is_reverse, not a.is_secondary)) - d2reads[(a.query_name, a.is_read1, not a.is_reverse, not a.is_secondary)] = a - else: - for a in d2list: - if (a.query_name, a.is_read1, a.is_reverse, not a.is_secondary) in d2reads: - multi_r.add((a.query_name, a.is_read1, a.is_reverse, not a.is_secondary)) - d2reads[(a.query_name, a.is_read1, a.is_reverse, not a.is_secondary)] = a - - dpairs = defaultdict(lambda: [], {}) - for aa in rSet: - if a.query_name in multi_r: - continue - a1 = d1reads[aa] - a2 = d2reads[aa] - a1clip_prefix = 0 - for a1c in 
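# Sketch of the read-pair signal counted by foldup_count above: improperly paired reads whose
# two ends map to the same chromosome in the same orientation and within a distance cutoff,
# the signature of a fold-back inversion. Assumes pysam AlignedSegment records; the original
# additionally restricts the orientation to match the strand being queried.
def count_foldback_pairs(reads, max_dist=100000):
    n = 0
    for a in reads:
        if a.is_unmapped or not a.is_paired or a.is_proper_pair or a.mate_is_unmapped:
            continue
        if a.reference_name != a.next_reference_name:
            continue
        if a.is_reverse != a.mate_is_reverse:
            continue  # keep only same-orientation pairs
        if abs(a.next_reference_start - a.reference_start) < max_dist:
            n += 1
    return n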
a1.cigartuples: - if a1c[0] == 5: - a1clip_prefix += a1c[1] - else: - break - a2clip_prefix = 0 - for a2c in a2.cigartuples: - if a2c[0] == 5: - a2clip_prefix += a2c[1] - else: - break - a1clip_suffix = 0 - for a1c in a1.cigartuples[::-1]: - if a1c[0] == 5: - a1clip_suffix += a1c[1] - else: - break - a2clip_suffix = 0 - for a2c in a2.cigartuples[::-1]: - if a2c[0] == 5: - a2clip_suffix += a2c[1] - else: - break - - if a1.is_reverse: - r1 = ( - a1.infer_query_length() + a1clip_suffix - a1.query_alignment_end, - a1.infer_query_length() + a1clip_suffix - a1.query_alignment_start - 1, - ) - else: - r1 = (a1.query_alignment_start + a1clip_prefix, a1.query_alignment_end - 1 + a1clip_prefix) - if a2.is_reverse: - r2 = ( - a2.infer_query_length() + a2clip_suffix - a2.query_alignment_end, - a2.infer_query_length() + a2clip_suffix - a2.query_alignment_start - 1, - ) - else: - r2 = (a2.query_alignment_start + a2clip_prefix, a2.query_alignment_end - 1 + a2clip_prefix) - - if r1[0] <= r2[0] and r1[1] <= r2[1]: - hom = r1[1] - r2[0] + 1 - prefix = True - elif r1[0] >= r2[0] and r1[1] >= r2[1]: - hom = r2[1] - r1[0] + 1 - prefix = False - else: - continue - - if ((e.v1.strand == 1) == (not a1.is_reverse)) != prefix: - continue - - if hom > 0: - # p1 = a1.reference_end - hom - 1 if e.v1.strand == 1 else a1.reference_start + hom - # p2 = a2.reference_end - hom - 1 if e.v2.strand == 1 else a2.reference_start + hom - p1 = a1.reference_end - 1 if e.v1.strand == 1 else a1.reference_start - p2 = a2.reference_end - 1 if e.v2.strand == 1 else a2.reference_start - else: - p1 = a1.reference_end - 1 if e.v1.strand == 1 else a1.reference_start - p2 = a2.reference_end - 1 if e.v2.strand == 1 else a2.reference_start - if (e.v1.chrom, e.v1.pos, e.v1.strand) != (e.v2.chrom, e.v2.pos, e.v2.strand): - dpairs[(hom, p1, p2)].append((a1, a2, r1, r2)) - elif p1 >= p2: - dpairs[(hom, p1, p2)].append((a1, a2, r1, r2)) - - if len(dpairs) == 0: - return (e, 0, [], None) - max_s = max([len(s) for s in dpairs.values()]) - max_p = [p for p in dpairs.keys() if len(dpairs[p]) == max_s] - - if len(max_p) != 1: - logging.debug( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + " refine discordant edge max_p not 1 " - + str(e) - + " " - + str(max_p) - ) - return (e, 0, [], None) - hom = max_p[0][0] - hom_seq = "" - if dpairs[max_p[0]][0][0].is_secondary: - vstrand = e.v2.strand - a = dpairs[max_p[0]][0][1] - else: - vstrand = e.v1.strand - a = dpairs[max_p[0]][0][0] - if hom >= 0: - if vstrand == 1: - hom_seq = a.query_sequence[a.query_alignment_end - hom : a.query_alignment_end] - else: - hom_seq = a.query_sequence[a.query_alignment_start : a.query_alignment_start + hom] - else: - if vstrand == 1: - hom_seq = a.query_sequence[a.query_alignment_end : a.query_alignment_end + abs(hom)] - else: - hom_seq = a.query_sequence[a.query_alignment_start - abs(hom) : a.query_alignment_start] - p1 = max_p[0][1] - p2 = max_p[0][2] - logging.debug( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + " refine discordant edge found " - + str( - breakpoint_edge( - breakpoint_vertex(e.v1.chrom, p1, e.v1.strand), breakpoint_vertex(e.v2.chrom, p2, e.v2.strand) - ) - ) - + " " - + str(hom) - + " " - + str(len(dpairs[max_p[0]])) - + " " - + str(len(rSet)) - ) - return ( - breakpoint_edge( - breakpoint_vertex(e.v1.chrom, p1, e.v1.strand), - breakpoint_vertex(e.v2.chrom, p2, e.v2.strand), - hom=hom, - hom_seq=hom_seq, - ), - hom, - dpairs[max_p[0]], - hom_seq, - ) - - def edge_has_high_mapq(self, read_list): - bp1_mapq = max([rr[0].mapping_quality for rr in 
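# Sketch of the junction homology computed in refine_discordant_edge above: given the
# read-coordinate spans of the two alignments of one split read (r1, r2, with hard clips
# added back as in the cigar handling above), a positive overlap is microhomology shared by
# both breakpoint sides and a negative value is untemplated inserted sequence.
def junction_homology(r1, r2):
    if r1[0] <= r2[0] and r1[1] <= r2[1]:
        return r1[1] - r2[0] + 1   # r1 aligns the read prefix
    if r1[0] >= r2[0] and r1[1] >= r2[1]:
        return r2[1] - r1[0] + 1   # r2 aligns the read prefix
    return None                    # one alignment contains the other: ambiguous, skipped above

print(junction_homology((0, 60), (55, 100)))   # 6  -> 6 bp homology at the junction
print(junction_homology((0, 45), (50, 100)))   # -4 -> 4 bp insertion at the junction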
read_list]) - bp2_mapq = max([rr[1].mapping_quality for rr in read_list]) - logging.debug("#TIME " + "%.3f\t" % (time() - self.tstart) + " breakpoint_mapq: %d %d" % (bp1_mapq, bp2_mapq)) - if bp1_mapq < self.breakpoint_mapping_quality_cutoff: - return False - if bp2_mapq < self.breakpoint_mapping_quality_cutoff: - return False - return True - - def edge_has_high_entropy(self, read_list): - try: - bp1_entropy = max( - [ - stats.entropy( - np.unique([x for x in rr[0].get_reference_sequence().upper() if x != "N"], return_counts=True)[ - 1 - ] - ) - for rr in read_list - ] - ) - bp2_entropy = max( - [ - stats.entropy( - np.unique([x for x in rr[1].get_reference_sequence().upper() if x != "N"], return_counts=True)[ - 1 - ] - ) - for rr in read_list - ] - ) - except ValueError: - # if the MD tag is missing from the BAM file (e.g. Isaac was used as the aligner, or some CRAM files), instead use the query sequence for entropy calc. - bp1_entropy = max( - [ - stats.entropy( - np.unique([x for x in rr[0].query_alignment_sequence.upper() if x != "N"], return_counts=True)[ - 1 - ] - ) - for rr in read_list - ] - ) - bp2_entropy = max( - [ - stats.entropy( - np.unique([x for x in rr[1].query_alignment_sequence.upper() if x != "N"], return_counts=True)[ - 1 - ] - ) - for rr in read_list - ] - ) - - logging.debug( - "#TIME " + "%.3f\t" % (time() - TSTART) + " breakpoint_entropy: %.3f %.3f" % (bp1_entropy, bp2_entropy) - ) - - if bp1_entropy < self.breakpoint_entropy_cutoff: - return False - if bp2_entropy < self.breakpoint_entropy_cutoff: - return False - return True - - def edge_passes_filters(self, read_list, e=None): - logging.debug("#TIME " + "%.3f\t" % (time() - TSTART) + " edge_breakpoint_filter: " + str(e)) - if self.edge_has_high_mapq(read_list) and self.edge_has_high_entropy(read_list): - return True - return False - - def sa_tag_overlaps_primary(self, a): - if not a.has_tag("SA"): - return False - t = a.get_tag("SA").split(",") - if t[0] != a.reference_name: - return False - if (t[2] == "+") != a.is_reverse: - return False - if min(abs(int(t[1]) - a.reference_start), abs(int(t[1]) - a.reference_end)) > self.read_length: - return False - return True - - def sa_tag_mismatch_breakpoint(self, a, bp): - if not a.has_tag("SA"): - return False - t = a.get_tag("SA").split(",") - if t[0] != a.reference_name: - return True - if (t[2] == "+") != a.is_reverse: - return True - if bp.strand == -1 and (a.reference_start != bp.pos or int(t[1]) != bp.pos): - return True - if bp.strand == 1: - if abs(a.reference_end - bp.pos) > 10: - return True - cigar_counts = [int(i) for i in re.findall(r"\d+", t[3])] - cigar_op = [i for i in re.findall(r"\D", t[3])] - sa_ref_len = sum([i[0] for i in zip(cigar_counts, cigar_op) if i[1] in "MDNX"]) - if abs(int(t[1]) + sa_ref_len - bp.pos) > 10: - return True - return False - - def interval_discordant_edges(self, interval, filter_repeats=True, pair_support=-1, ms=None, amplicon_name=None): - logging.debug("#TIME " + "%.3f\t" % (time() - TSTART) + " searching discordant edges in " + str(interval)) - if pair_support == -1: - pair_support = self.pair_support - if type(interval) != hg.interval_list: - ilist = hg.interval_list([interval]) - else: - ilist = interval - if ( - tuple([(i.chrom, i.start, i.end) for i in ilist]), - filter_repeats, - pair_support, - not ms is None, - ) in self.discordant_edge_calls: - return self.discordant_edge_calls[ - (tuple([(i.chrom, i.start, i.end) for i in ilist]), filter_repeats, pair_support, not ms is None) - ] - - interval = ilist[0] - dflist = 
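# Sketch of the two per-breakpoint read filters above: the best mapping quality on each side
# must reach a cutoff, and the base composition of at least one supporting read per side must
# have enough Shannon entropy to rule out low-complexity sequence. The cutoff values are
# illustrative; the real ones are attributes of this class.
import numpy as np
from scipy import stats

def passes_mapq(mapqs, cutoff=10):
    return max(mapqs) >= cutoff

def sequence_entropy(seq):
    counts = np.unique([b for b in seq.upper() if b != "N"], return_counts=True)[1]
    return stats.entropy(counts)

print(sequence_entropy("ACGTACGTACGT"))   # ~1.39, high complexity
print(sequence_entropy("AAAAAAAAAAAT"))   # ~0.29, low complexity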
[] - drlist = [] - for i in ilist: - dflist += [ - a - for a in self.fetch(i.chrom, max(1, i.start), i.end) - if not a.is_unmapped - and not a.is_reverse - and a.is_paired - and not a.is_proper_pair - and not a.mate_is_unmapped - and not a.is_secondary - and a.reference_end is not None - and a.mapping_quality > self.mapping_quality_cutoff - and not ( - a.reference_name == a.next_reference_name - and a.mate_is_reverse - and abs(a.reference_start - a.next_reference_start) < self.max_insert - ) - ] # this section catches everted sequencing artifacts - drlist += [ - a - for a in self.fetch(i.chrom, max(1, i.start), i.end) - if not a.is_unmapped - and a.is_reverse - and a.is_paired - and not a.is_proper_pair - and not a.mate_is_unmapped - and not a.is_secondary - and a.reference_end is not None - and a.mapping_quality > self.mapping_quality_cutoff - and not ( - a.reference_name == a.next_reference_name - and not a.mate_is_reverse - and abs(a.reference_start - a.next_reference_start) < self.max_insert - ) - ] # this section catches everted sequencing artifacts - # logging.debug("#TIME " + '%.3f\t'%(time() - TSTART) + " discordant edges: fetch discordant " + str(interval) + " " + str(len(dflist)) + " " + str(len(drlist))) - # dflist = [a for a in dflist if not(a.reference_name == a.next_reference_name and a.mate_is_reverse and abs(a.template_length) < self.max_insert)] - # drlist = [a for a in drlist if not(a.reference_name == a.next_reference_name and not a.mate_is_reverse and abs(a.template_length) < self.max_insert)] - - # dflist = [a for a in dflist if not(a.reference_name == a.next_reference_name and a.mate_is_reverse and abs(a.reference_start - a.next_reference_start) < self.max_insert)] - # drlist = [a for a in drlist if not(a.reference_name == a.next_reference_name and not a.mate_is_reverse and abs(a.reference_start - a.next_reference_start) < self.max_insert)] - - logging.debug( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + " discordant edges: discordant read pairs found: %s %s %s" % (str(interval), len(dflist), len(drlist)) - ) - - # perform biclustering for readpairs using union-find algorithm to give sets of connected read-pairs clist - vlist = [] - vcount = 0 - vdict = {} - for a in dflist + drlist: - vlist.append( - ( - hg.absPos(a.reference_name, a.reference_start) * (-1 if a.is_reverse else 1), - hg.absPos(a.next_reference_name, a.next_reference_start) * (-1 if a.mate_is_reverse else 1), - a, - vcount, - ) - ) - vdict[vcount] = a - vcount += 1 - # vlist = [(hg.absPos(a.reference_name, a.reference_start) * (-1 if a.is_reverse else 1), hg.absPos(a.next_reference_name, a.next_reference_start) * (-1 if a.mate_is_reverse else 1), a) for a in dflist + drlist] - v0list = copy.copy(vlist) - v0list.sort(key=lambda x: x[0]) - v1list = copy.copy(vlist) - v1list.sort(key=lambda x: x[1]) - dlist = [] - v0listp = [v[0] for v in v0list] - v1listp = [v[1] for v in v1list] - plist = defaultdict(lambda: None, {}) - rlist = defaultdict(lambda: 0, {}) - nlist = defaultdict(lambda: 1, {}) - # identify edges with bisect and union-find algorithm - # iii = 0 - for v in vlist: - # iii += 1 - s0 = bisect.bisect_left(v0listp, v[0] - self.max_insert + self.read_length) - e0 = bisect.bisect_right(v0listp, v[0] + self.max_insert - self.read_length) - s1 = bisect.bisect_left(v1listp, v[1] - self.max_insert + self.read_length) - e1 = bisect.bisect_right(v1listp, v[1] + self.max_insert - self.read_length) - SS0 = [vv[3] for vv in v0list[s0 : e0 + 1] if vv[3] > v[3]] - SS1 = [vv[3] for vv in v1list[s1 : e1 + 1] 
if vv[3] > v[3]] - SS0.sort() - SS1.sort() - SS_intersect = [] - i0 = 0 - i1 = 0 - while True: - if i0 == len(SS0) or i1 == len(SS1): - break - if SS0[i0] == SS1[i1]: - SS_intersect.append(SS0[i0]) - i0 += 1 - i1 += 1 - elif SS0[i0] < SS1[i1]: - i0 += 1 - else: - i1 += 1 - if len(SS_intersect) >= pair_support: - dlist.append(v[2]) - v1 = v[3] - for v2 in SS_intersect: - v1g = v1 - v2g = v2 - while plist[v1g] is not None: - v1g = plist[v1g] - while plist[v2g] is not None: - v2g = plist[v2g] - if v1g == v2g: - continue - if rlist[v1g] > rlist[v2g]: - plist[v2g] = v1g - rlist[v1g] = max(rlist[v1g], rlist[v2g] + 1) - nlist[v1g] += nlist[v2g] - else: - plist[v1g] = v2g - rlist[v2g] = max(rlist[v2g], rlist[v1g] + 1) - nlist[v2g] += nlist[v1g] - clist = defaultdict(lambda: [], {}) - for v in plist: - vg = v - while plist[vg] is not None: - vg = plist[vg] - clist[vdict[vg]].append(vdict[v]) - - mcdflist = [] - mcdrlist = [] - hgddict = {} - for c in clist: - if len(clist[c]) < pair_support: - continue - ml = clist[c] - if filter_repeats: - ml = [ - v - for v in clist[c] - if not hg.interval(v, bamfile=self.bamfile).filter_repeat() - and v.mapping_quality > self.mapping_quality_cutoff - ] - if len(ml) < pair_support: - continue - hgl = hg.interval_list([]) - for v in ml: - hgv = hg.interval(v, bamfile=self.bamfile) - hgddict[hgv] = v - hgl.append(hgv) - hgl.sort() - if c.is_reverse: - mcdrlist.extend(hgl.merge_clusters(extend=self.max_insert - self.read_length)) - else: - mcdflist.extend(hgl.merge_clusters(extend=self.max_insert - self.read_length)) - - # logging.debug("#TIME " + '%.3f\t'%(time() - TSTART) + " discordant edges: discordant clusters found: %s %d %d " % (str(interval), len(mcdflist), len(mcdrlist))) - - dnlist0 = [] - dnlist = [] - clist = hg.interval_list([c[0] for c in mcdflist + mcdrlist]) - clist.sort() - ci = 0 - for c1 in mcdflist + mcdrlist: - ci += 1 - neighbor_hglist = hg.interval_list([]) - for a1 in c1[1]: - neighbor_hglist.append( - hg.interval( - hgddict[a1].next_reference_name, - hgddict[a1].next_reference_start, - hgddict[a1].next_reference_start, - ) - ) - neighbor_hglist.sort() - neighbor_hglist = hg.interval_list( - [ - a2[0] - for a2 in neighbor_hglist.merge_clusters(extend=self.max_insert - self.read_length) - if len(a2[1]) >= pair_support - ] - ) - for c2 in mcdflist + mcdrlist: - if len(hg.interval_list([c2[0]]).intersection(neighbor_hglist, extend=self.max_insert)) == 0: - continue - vl = [] - vlSet = set() - vl1Set = set() - vl2Set = set() - for a1 in c1[1]: - for a2 in c2[1]: - aq1 = hgddict[a1] - aq2 = hgddict[a2] - if aq1.query_name == aq2.query_name and aq1.is_read1 != aq2.is_read1: - if ( - aq1.reference_name == aq2.reference_name - and abs(aq1.reference_start - aq2.reference_start) < self.read_length - and abs(aq1.reference_end - aq2.reference_end) < self.read_length - and aq1.is_reverse != aq2.is_reverse - ): - continue - if ( - aq1.reference_name == aq2.reference_name - and aq1.is_reverse - and not aq2.is_reverse - and aq1.reference_start - aq2.reference_end + 1 > 0 - and aq1.reference_start - aq2.reference_end + 1 < self.max_insert - 2 * self.read_length - ): - continue - if ( - aq2.reference_name == aq1.reference_name - and aq2.is_reverse - and not aq1.is_reverse - and aq2.reference_start - aq1.reference_end + 1 > 0 - and aq2.reference_start - aq1.reference_end + 1 < self.max_insert - 2 * self.read_length - ): - continue - vl.append((aq1, aq2)) - vlSet.add((aq1.reference_start, aq1.reference_end, aq2.reference_start, aq2.reference_end)) - 
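# Compact sketch of the union-find (disjoint-set) structure used above to cluster discordant
# read pairs whose two ends each fall within max_insert of one another; plist, rlist and
# nlist in the original play the roles of parent pointers, ranks and cluster sizes.
class UnionFind:
    def __init__(self):
        self.parent, self.rank = {}, {}

    def find(self, x):
        self.parent.setdefault(x, x)
        self.rank.setdefault(x, 0)
        while self.parent[x] != x:
            self.parent[x] = self.parent[self.parent[x]]  # path halving
            x = self.parent[x]
        return x

    def union(self, a, b):
        ra, rb = self.find(a), self.find(b)
        if ra == rb:
            return
        if self.rank[ra] < self.rank[rb]:
            ra, rb = rb, ra
        self.parent[rb] = ra
        if self.rank[ra] == self.rank[rb]:
            self.rank[ra] += 1

uf = UnionFind()
for a, b in [(0, 1), (1, 2), (5, 6)]:
    uf.union(a, b)
print(uf.find(2) == uf.find(0), uf.find(5) == uf.find(0))  # True False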
vl1Set.add((aq1.reference_start, aq1.reference_end)) - vl2Set.add((aq2.reference_start, aq2.reference_end)) - if len(vl) == 0 or len([v for v in vl if v[1].reference_start * v[0].reference_start > 0]) == 0: - continue - if not vl[0][0].is_reverse: - bp1 = breakpoint_vertex( - c1[0].chrom, max([v[0].reference_end - 1 for v in vl if v[0].reference_start > 0]), 1 - ) - else: - bp1 = breakpoint_vertex( - c1[0].chrom, min([v[0].reference_start for v in vl if v[0].reference_start > 0]), -1 - ) - if not vl[0][1].is_reverse: - bp2 = breakpoint_vertex( - c2[0].chrom, max([v[1].reference_end - 1 for v in vl if v[1].reference_start > 0]), 1 - ) - else: - bp2 = breakpoint_vertex( - c2[0].chrom, min([v[1].reference_start for v in vl if v[1].reference_start > 0]), -1 - ) - if ms is None: - ps = pair_support - else: - ps = self.pair_support_count(bp1.chrom, bp1.pos, bp1.strand, ms) - if len(vl) < ps or len(vl1Set) < pair_support or len(vl2Set) < pair_support: - continue - - if ( - bp1.chrom == bp2.chrom - and bp1.pos == bp2.pos - and bp1.strand == bp2.strand - and len(vl) < 2 * self.pair_support - ): - continue - - num_inverted = 0 - bp1c = None - bp2c = None - vl2 = [] - if bp1.chrom == bp2.chrom and bp1.strand == bp2.strand and abs(bp1.pos - bp2.pos) <= self.read_length: - non_inverted_reads = set() - multiple_non_inverted = False - if bp1.strand == 1: - for v in vl: - if v[0].reference_start == v[1].reference_start: - num_inverted += 1 - elif self.sa_tag_overlaps_primary(v[0]): - num_inverted += 1 - elif self.sa_tag_overlaps_primary(v[1]): - num_inverted += 1 - else: - vl2.append(v) - if not multiple_non_inverted: - non_inverted_reads.add(v[0].query_name) - if len(non_inverted_reads) >= ps: - multiple_non_inverted = True - else: - for v in vl: - if v[0].reference_end == v[1].reference_end: - num_inverted += 1 - elif self.sa_tag_overlaps_primary(v[0]): - num_inverted += 1 - elif self.sa_tag_overlaps_primary(v[1]): - num_inverted += 1 - else: - vl2.append(v) - if not multiple_non_inverted: - non_inverted_reads.add(v[0].query_name) - if len(non_inverted_reads) >= ps: - multiple_non_inverted = True - logging.debug( - "checking foldback2: " - + str(bp1) - + str(bp2) - + " %s %s %d %d %d" % (bp1.strand, bp2.strand, len(vl), num_inverted, ps) - ) - - if len(vl2) < ps or (not multiple_non_inverted): - logging.debug("FOLDBACK: " + str(bp1) + str(bp2)) - continue - vl = vl2 - vl.sort(key=lambda x: x[0].reference_start - x[1].reference_start) - if bp1.strand == 1: - maxp = vl[0][0].reference_end - 1 - maxn = 0 - for v in vl[::-1]: - if ( - len( - [ - v1 - for v1 in vl - if v1[0].reference_end <= v[0].reference_end - and v1[0].reference_start - > v[0].reference_end - 1 - self.max_insert + 2 * self.read_length - ] - ) - > maxn - ): - maxp = v[0].reference_end - maxn = len( - [ - v1 - for v1 in vl - if v1[0].reference_end <= v[0].reference_end - and v1[0].reference_end - > v[0].reference_end - self.max_insert + 2 * self.read_length - ] - ) - vl = [ - v - for v in vl - if v[0].reference_end - 1 <= maxp - and v[0].reference_end - 1 > maxp - self.max_insert + 2 * self.read_length - ] - if len(vl) < ps: - continue - bp1 = breakpoint_vertex( - c1[0].chrom, max([v[0].reference_end - 1 for v in vl if v[0].reference_start > 0]), 1 - ) - bp2 = breakpoint_vertex( - c2[0].chrom, max([v[1].reference_end - 1 for v in vl if v[1].reference_start > 0]), 1 - ) - if bp1.pos != bp2.pos: - bp1c = bp2 - bp2c = bp1 - else: - maxp = vl[-1][0].pos - maxn = 0 - for v in vl: - if ( - len( - [ - v1 - for v1 in vl - if v1[0].reference_start 
>= v[0].reference_start - and v1[0].reference_start - < v[0].reference_start + self.max_insert - 2 * self.read_length - ] - ) - > maxn - ): - maxp = v[0].reference_start - maxn = len( - [ - v1 - for v1 in vl - if v1[0].reference_start >= v[0].reference_start - and v1[0].reference_start - < v[0].reference_start + self.max_insert - 2 * self.read_length - ] - ) - vl = [ - v - for v in vl - if v[0].reference_start >= maxp - and v[0].reference_start < maxp + self.max_insert - 2 * self.read_length - ] - if len(vl) < ps: - continue - bp1 = breakpoint_vertex( - c1[0].chrom, min([v[0].reference_start for v in vl if v[0].reference_start > 0]), -1 - ) - bp2 = breakpoint_vertex( - c2[0].chrom, min([v[1].reference_start for v in vl if v[1].reference_start > 0]), -1 - ) - if bp1.pos != bp2.pos: - bp1c = bp2 - bp2c = bp1 - bre_refine = self.refine_discordant_edge(breakpoint_edge(bp1, bp2)) - bre = bre_refine[0] - - if bp1.chrom == bp2.chrom and bp1.strand == bp2.strand and abs(bp1.pos - bp2.pos) <= self.read_length: - qname_exclude = set([]) - for v in vl: - if (bp1.strand == 1 and max(v[0].reference_start, v[1].reference_start) > bre.v1.pos) or ( - bp1.strand == -1 and max(v[0].reference_end, v[1].reference_end) < bre.v1.pos - ): - qname_exclude.add(v[0].query_name) - continue - if ( - self.sa_tag_mismatch_breakpoint(v[0], bre.v1) - or self.sa_tag_mismatch_breakpoint(v[0], bre.v1) - or self.sa_tag_overlaps_primary(v[0]) - or self.sa_tag_overlaps_primary(v[1]) - ): - qname_exclude.add(v[0].query_name) - continue - if ( - bp1.strand == 1 - and bre.v1.pos - v[0].reference_start + bre.v2.pos - v[1].reference_start > self.max_insert - ): - qname_exclude.add(v[0].query_name) - continue - if ( - bp2.strand == 1 - and v[0].reference_end - bre.v1.pos + v[1].reference_end - bre.v2.pos > self.max_insert - ): - qname_exclude.add(v[0].query_name) - continue - vl = [v for v in vl if v[0].query_name not in qname_exclude] - if len(vl) < ps: - continue - - if bre.type() == "everted" and abs(bre.v1.pos - bre.v2.pos) < self.max_insert: - logging.debug("skipping everted edge " + str(bp1) + str(bp2)) - continue - if bre.type() != "concordant": - if self.edge_passes_filters(vl, bre): - dnlist0.append((bre, len(vl))) - if bp1c is not None and bp2c is not None: - brec_refine = self.refine_discordant_edge(breakpoint_edge(bp1c, bp2c)) - brec = brec_refine[0] - if brec.type() != "concordant" and brec.v1.pos != brec.v2.pos: - if self.edge_passes_filters(vl, brec): - dnlist0.append((brec, len([(v[1], v[0]) for v in vl]))) - - # remove local edges with no complementary edges and add warning if any found - for bb1 in dnlist0: - for bb2 in dnlist0: - bre1 = bb1[0] - bre2 = bb2[0] - if bre1 == bre2 and (bre1.v1.chrom, bre1.v1.pos, bre1.v1.strand) != ( - bre1.v2.chrom, - bre1.v2.pos, - bre1.v2.strand, - ): - continue - if ( - (bre2.v2.chrom, bre2.v2.pos, bre2.v2.strand) == (bre1.v1.chrom, bre1.v1.pos, bre1.v1.strand) - and (bre2.v1.chrom, bre2.v1.pos, bre2.v1.strand) == (bre1.v2.chrom, bre1.v2.pos, bre1.v2.strand) - ) and bb1 not in dnlist: - dnlist.append(bb1) - continue - if len(dnlist) != len(dnlist0): - logging.debug("dnlists do not match " + str(len(dnlist0)) + " " + str(len(dnlist))) - for bb1 in dnlist0: - if bb1 not in dnlist: - logging.debug("dnlist0: " + str(bb1[0]) + " " + str(bb1[1])) - for bb1 in dnlist: - if bb1 not in dnlist0: - logging.debug("dnlist: " + str(bb1[0]) + " " + str(bb1[1])) - - logging.debug( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + " discordant edges: local edges done " - + str(interval) - + " " - + 
str(len(mcdflist)) - + " " - + str(len(mcdrlist)) - + " " - + str(len(dnlist)) - ) - self.get_mates_time = 0 - self.get_mates_num_calls = 0 - for c in mcdflist + mcdrlist: - nlist = [] - if filter_repeats: - if len(hg.interval_list([c[0]]).intersection(hg.conserved_regions)) > 0: - continue - rep_content_time = 0 - intersection_time = 0 - nr_calls = 0 - for hga in c[1]: - nmatelist = self.get_mates(hgddict[hga]) - if filter_repeats: - rpc = time() - nmatelist = [ - a - for a in nmatelist - if not hg.interval(a, bamfile=self.bamfile).filter_repeat() - and a.mapping_quality > self.mapping_quality_cutoff - ] - nr_calls += len(nmatelist) - rep_content_time += time() - rpc - ict = time() - nmatelist = [ - a - for a in nmatelist - if len(hg.interval_list([hg.interval(a, bamfile=self.bamfile)]).intersection(ilist)) == 0 - ] - intersection_time += time() - ict - nlist += nmatelist - nflist = [n for n in nlist if not n.is_reverse] - nrlist = [n for n in nlist if n.is_reverse] - hgndict = {hg.interval(a, bamfile=self.bamfile): a for a in nflist + nrlist} - hgnflist = hg.interval_list([hga for hga in hgndict if hga.strand == 1]) - hgnrlist = hg.interval_list([hga for hga in hgndict if hga.strand == -1]) - hgnflist.sort() - hgnrlist.sort() - mcnflist = hgnflist.merge_clusters(self.max_insert - 2 * self.read_length) - mcnrlist = hgnrlist.merge_clusters(self.max_insert - 2 * self.read_length) - mcnflist = [m for m in mcnflist if len(m[1]) >= pair_support] - mcnrlist = [m for m in mcnrlist if len(m[1]) >= pair_support] - mcnlist = mcnflist + mcnrlist - for cn in mcnlist: - vl = [] - vlSet = set() - vl1Set = set() - vl2Set = set() - if filter_repeats: - if len(hg.interval_list([cn[0]]).intersection(hg.conserved_regions)) > 0: - continue - hgmi = 0 - for hgm in cn[1]: - hgmi += 1 - if filter_repeats: - if hgm.filter_repeat() or hgndict[hgm].mapping_quality <= self.mapping_quality_cutoff: - continue - for a in self.get_mates(hgndict[hgm]): - if filter_repeats: - if ( - hg.interval(a, bamfile=self.bamfile).filter_repeat() - or a.mapping_quality <= self.mapping_quality_cutoff - ): - continue - if hg.interval(a, bamfile=self.bamfile).intersects(c[0]): - vl.append((a, hgndict[hgm])) - vlSet.add( - ( - a.reference_start, - a.reference_end, - hgndict[hgm].reference_start, - hgndict[hgm].reference_end, - ) - ) - vl1Set.add((a.reference_start, a.reference_end)) - vl2Set.add((hgndict[hgm].reference_start, hgndict[hgm].reference_end)) - break - if len(vl) == 0 or len([v for v in vl if v[1].reference_start * v[0].reference_start > 0]) == 0: - continue - if not vl[0][0].is_reverse: - bp1 = breakpoint_vertex(vl[0][0].reference_name, max([v[0].reference_end - 1 for v in vl]), 1) - else: - bp1 = breakpoint_vertex( - vl[0][0].reference_name, min([v[0].reference_start for v in vl if v[0].reference_start > 0]), -1 - ) - if not vl[0][1].is_reverse: - bp2 = breakpoint_vertex(vl[0][1].reference_name, max([v[1].reference_end - 1 for v in vl]), 1) - else: - bp2 = breakpoint_vertex( - vl[0][1].reference_name, min([v[1].reference_start for v in vl if v[1].reference_start > 0]), -1 - ) - if ms is None: - ps = pair_support - else: - ps = self.pair_support_count(bp1.chrom, bp1.pos, bp1.strand, ms) - - if len(vl) < ps or len(vl1Set) < pair_support or len(vl2Set) < pair_support: - continue - num_inverted = 0 - non_inverted_reads = set() - multiple_non_inverted = False - if bp1.chrom == bp2.chrom and bp1.pos == bp2.pos and bp1.strand == bp2.strand: - if bp1.strand == 1: - for v in vl: - if v[0].reference_start == v[1].reference_start: - 
num_inverted += 1 - elif not multiple_non_inverted: - non_inverted_reads.add(v[0].query_name) - if len(non_inverted_reads) >= ps: - multiple_non_inverted = True - else: - for v in vl: - if v[0].reference_end == v[1].reference_end: - num_inverted += 1 - elif not multiple_non_inverted: - non_inverted_reads.add(v[0].query_name) - if len(non_inverted_reads) >= ps: - multiple_non_inverted = True - if len(vl) - num_inverted < ps or (not multiple_non_inverted): - continue - bre_refine = self.refine_discordant_edge(breakpoint_edge(bp1, bp2)) - bre = bre_refine[0] - if bre.type() != "concordant": - if self.edge_passes_filters(vl, bre): - dnlist.append((bre, len(vl))) - logging.debug( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + " discordant edges: external edges done " - + str(interval) - + " " - + str(self.get_mates_time) - + " " - + str(self.get_mates_num_calls) - ) - dnlist.sort(key=lambda x: hg.absPos(x[0].v1.chrom, x[0].v1.pos) + 0.5 * x[0].v1.strand) - for e in dnlist: - logging.debug( - "#TIME %.3f\tdiscordant edges %s %s %s %s %d %f" - % ( - time() - TSTART, - e[0], - e[1], - e[0].type(), - self.concordant_edge(e[0].v1)[0], - len(self.concordant_edge(e[0].v1)[1]), - hg.interval( - e[0].v1.chrom, e[0].v1.pos, e[0].v1.pos - e[0].v1.strand * self.max_insert - ).rep_content(), - ) - ) - self.discordant_edge_calls[ - (tuple([(i.chrom, i.start, i.end) for i in ilist]), filter_repeats, pair_support, not ms is None) - ] = dnlist - return dnlist - - def load_edges(self, edge_file): - edge_lines = [line.strip().split() for line in open(edge_file)] - edges = [] - for el in edge_lines: - if el[2] == "None": - hom = None - hom_seq = None - else: - hom = int(el[2]) - if hom != 0: - hom_seq = el[3] - else: - hom_seq = "" - e = breakpoint_edge(el[0], hom=hom, hom_seq=hom_seq) - edges.append((e, int(el[1]))) - edges.sort(key=lambda x: hg.absPos(x[0].v1.chrom, x[0].v1.pos) + 0.1 * x[0].v1.strand) - return edges - - def get_sensitive_discordant_edges( - self, - ilist, - msrlist, - eilist=None, - filter_repeats=True, - pair_support=-1, - ms_window_size0=10000, - ms_window_size1=300, - adaptive_counts=True, - gcc=False, - amplicon_name=None, - ): - if amplicon_name is not None and os.path.exists("%s_edges_cnseg.txt" % amplicon_name): - return self.load_edges("%s_edges_cnseg.txt" % amplicon_name) - if amplicon_name is not None and os.path.exists("%s_edges.txt" % amplicon_name): - eilist = self.load_edges("%s_edges.txt" % amplicon_name) - else: - if eilist is None: - if adaptive_counts: - eilist = self.interval_discordant_edges(ilist, ms=msrlist, pair_support=pair_support) - else: - eilist = self.interval_discordant_edges(ilist, pair_support=pair_support) - eilist.sort(key=lambda x: hg.absPos(x[0].v1.chrom, x[0].v1.pos) + 0.1 * x[0].v1.strand) - if amplicon_name is not None: - edge_file = open("%s_edges.txt" % amplicon_name, "w") - for e in eilist: - edge_file.write("%s\t%s\t%s\t%s\n" % (str(e[0]), e[1], e[0].hom, e[0].hom_seq)) - edge_file.close() - eiSet = set( - [(e[0].v1.chrom, e[0].v1.pos, e[0].v1.strand, e[0].v2.chrom, e[0].v2.pos, e[0].v2.strand) for e in eilist] - ) - for i, msr in zip(ilist, msrlist): - elist = [] - for e in eilist: - if e[0].v1.pos != -1 and hg.interval(e[0].v1.chrom, e[0].v1.pos, e[0].v1.pos).intersects(i): - elist.append(e) - ms_vlist = [] - msv_index = {} - for msi in range(len(msr) - 1): - if msr[msi + 1].info["cn"] < msr[msi].info["cn"]: - msv = breakpoint_vertex(i.chrom, msr[msi].end, 1) - else: - msv = breakpoint_vertex(i.chrom, msr[msi].end + 1, -1) - ms_vlist.append(msv) - 
msv_index[msv] = msi - print("Meanshift", str(i), len(ms_vlist), ms_vlist) - sys.stdout.flush() - for msv in ms_vlist: - msi = msv_index[msv] - if ("end_refined" in msr[msi].info) and msr[msi].info["end_refined"]: - msve = [ - e - for e in elist - if e[0].v1.strand * (msr[msi].info["cn"] - msr[msi + 1].info["cn"]) > 0 - and abs(e[0].v1.pos - msr[msi].end) < self.max_insert + ms_window_size1 - ] - if len(msve) == 0: - # print("finesearch discordant edges", i.chrom, str(msr[msi]), str(msr[msi + 1])) - efine = self.interval_discordant_edges( - hg.interval( - i.chrom, - msv.pos - ms_window_size0 - self.max_insert, - msv.pos + ms_window_size1 + self.max_insert, - ), - pair_support=2, - ) - if ( - len( - [ - e - for e in efine - if e[0].v1.strand * (msr[msi].info["cn"] - msr[msi + 1].info["cn"]) > 0 - ] - ) - > 0 - ): - if ( - len( - [ - (e[1], e[0]) - for e in efine - if e[0].v1.strand * (msr[msi].info["cn"] - msr[msi + 1].info["cn"]) > 0 - and abs(e[0].v1.pos - msv.pos) < ms_window_size1 - ] - ) - > 0 - ): - ebest = max( - [ - (e[1], e[0]) - for e in efine - if e[0].v1.strand * (msr[msi].info["cn"] - msr[msi + 1].info["cn"]) > 0 - and abs(e[0].v1.pos - msv.pos) < ms_window_size1 - ] - ) - else: - ebest = max( - [ - (e[1], e[0]) - for e in efine - if e[0].v1.strand * (msr[msi].info["cn"] - msr[msi + 1].info["cn"]) > 0 - ] - ) - ebest = (ebest[1], ebest[0]) - # msve = [ebest] - - # print("finesearch discordant edge found", i.chrom, str(msr[msi]), str(msr[msi + 1]), str(ebest[0]), ebest[1]) - if ( - ebest[0].v1.chrom, - ebest[0].v1.pos, - ebest[0].v1.strand, - ebest[0].v2.chrom, - ebest[0].v2.pos, - ebest[0].v2.strand, - ) not in eiSet: - elist.append(ebest) - eilist.append(ebest) - eiSet.add( - ( - ebest[0].v1.chrom, - ebest[0].v1.pos, - ebest[0].v1.strand, - ebest[0].v2.chrom, - ebest[0].v2.pos, - ebest[0].v2.strand, - ) - ) - if ( - len( - hg.interval_list( - [hg.interval(ebest[0].v2.chrom, ebest[0].v2.pos, ebest[0].v2.pos)] - ).intersection(ilist) - ) - > 0 - ): - if ( - ebest[0].v2.chrom, - ebest[0].v2.pos, - ebest[0].v2.strand, - ebest[0].v1.chrom, - ebest[0].v1.pos, - ebest[0].v1.strand, - ) not in eiSet: - eilist.append((breakpoint_edge(ebest[0].v2, ebest[0].v1), ebest[1])) - eiSet.add( - ( - ebest[0].v2.chrom, - ebest[0].v2.pos, - ebest[0].v2.strand, - ebest[0].v1.chrom, - ebest[0].v1.pos, - ebest[0].v1.strand, - ) - ) - elist.sort(key=lambda x: hg.absPos(x[0].v1.chrom, x[0].v1.pos) + 0.1 * x[0].v1.strand) - eilist.sort(key=lambda x: hg.absPos(x[0].v1.chrom, x[0].v1.pos) + 0.1 * x[0].v1.strand) - - # Maintainer found the following lines had no effect - # else: - # # print("msv end not refined", str(msr[msi]), str(msr[msi + 1])) - # msve = [e for e in elist if e[0].v1.strand * (msr[msi].info['cn'] - msr[msi + 1].info['cn']) > 0 and abs( - # e[0].v1.pos - msr[msi].end) < self.max_insert + ms_window_size0] - - if amplicon_name is not None: - edge_file = open("%s_edges_cnseg.txt" % amplicon_name, "w") - for e in eilist: - edge_file.write("%s\t%s\t%s\t%s\n" % (str(e[0]), e[1], e[0].hom, e[0].hom_seq)) - edge_file.close() - return eilist - - def construct_segment(self, v): - cpos = v.pos - v.strand * self.max_insert / 2 - cprevious = v.pos - cflag = v.pos - while abs(cflag - cprevious) < self.window_size: - cprevious = cpos - cpos = cpos - v.strand * self.max_insert / 2 - drange = [cpos, cpos + v.strand * self.max_insert] - drange.sort() - dlist = [a for a in self.fetch(v.chrom, drange[0], drange[1])] - if len(dlist) * self.read_length < self.min_coverage * self.max_insert: - continue - cflag 
= cprevious - if abs(cprevious - v.pos) > self.max_insert: - v1 = breakpoint_vertex(v.chrom, cprevious, v.strand) - discordant_neighbors = self.get_discordant_neighbors(v1) - if len(discordant_neighbors) > 0: - return v1 - v2 = breakpoint_vertex(v.chrom, cpos, -1 * v.strand) - discordant_neighbors = self.get_discordant_neighbors(v2) - if len(discordant_neighbors) > 0: - return v2 - return None - - # Methods to find all intervals in amplicon - def interval_neighbors(self, i, ilist=[], rdlist=[], t=0, gcc=False): - i2 = self.interval_extend(i) - # i2 = i - # i2 = self.interval_extend(i, ilist, rdlist) - ms_window_size0 = 10000 - ms_window_size1 = 300 - merge_thresh = 100000 - logging.info("#TIME " + "%.3f\t" % (time() - TSTART) + " Calculating coverage meanshift segmentation") - msrlist = [self.get_meanshift(i2, ms_window_size0, ms_window_size1, gcc)] - logging.info("#TIME " + "%.3f\t" % (time() - TSTART) + " Detecting breakpoint edges (interval neighbors)") - edges = self.interval_discordant_edges(i2, ms=msrlist) - edges = [(e[1], e[0]) for e in edges] - edges.sort(reverse=True) - edges = [(e[1], e[0]) for e in edges] - ei = 0 - logging.info("#TIME " + "%.3f\t" % (time() - TSTART) + " Selecting neighbors") - neighbors = hg.interval_list([]) - while len(neighbors) < 10 and ei < len(edges): - covered = False - for i3 in ilist + neighbors: - if ( - i3.chrom == edges[ei][0].v2.chrom - and edges[ei][0].v2.pos >= i3.start - and edges[ei][0].v2.pos <= i3.end - ): - ei += 1 - covered = True - break - if covered: - continue - found_neighbor = False - for i3 in rdlist: - if ( - i3.chrom == edges[ei][0].v2.chrom - and edges[ei][0].v2.pos >= i3.start - and edges[ei][0].v2.pos <= i3.end - ): - # n = i3 - n = hg.interval(i3.chrom, i3.start, i3.end) - found_neighbor = True - if not found_neighbor: - if edges[ei][0].v2.strand < 0: - n = self.interval_extend( - hg.interval( - edges[ei][0].v2.chrom, - edges[ei][0].v2.pos, - min(hg.chrLen[hg.chrNum(edges[ei][0].v2.chrom)] - 1, edges[ei][0].v2.pos + self.max_insert), - ) - ) - else: - n = self.interval_extend( - hg.interval( - edges[ei][0].v2.chrom, max(0, edges[ei][0].v2.pos - self.max_insert), edges[ei][0].v2.pos - ) - ) - if found_neighbor or n.size() > self.max_insert + 2: - n.info = edges[ei][1] - neighbors.append(n) - ei += 1 - neighbors.sort() - mc = neighbors.merge_clusters(extend=merge_thresh) # previously ms_window_size0 - for c in mc: - c[0].info = sum([c1.info for c1 in c[1]]) - nn = hg.interval_list([c[0] for c in mc]) - for e in nn: - logging.debug( - "#TIME " + "%.3f\t" % (time() - TSTART) + " interval_neighbors: edges %s %s" % (str(i), str(e)) - ) - return nn - - def interval_hops(self, i=None, ilist=[], rdlist=[], gcc=False, explore=True): - if type(i) == list or type(i) == hg.interval_list: - i1list = i - i = i[0] - else: - i1list = hg.interval_list([i]) - logging.debug("#TIME " + "%.3f\t" % (time() - TSTART) + " interval_hops: init " + str(i)) - ms_window_size0 = 10000 - i2list = hg.interval_list([]) - for i2 in i1list: - ii = self.interval_extend(i2) - logging.debug( - "#TIME " + "%.3f\t" % (time() - TSTART) + " interval_hops: interval extend " + str(i2) + " " + str(ii) - ) - i2list.append(ii) - seen_list = hg.interval_list([]) - unseen_list = [(0, ii) for ii in i2list] - heapq.heapify(unseen_list) - # clist = hg.interval_list(i2list) - clist = hg.interval_list([ii[0] for ii in i2list.merge_clusters(extend=1)]) - while len(seen_list) < 10 and len(unseen_list) > 0: - icc = heapq.heappop(unseen_list) - ic = icc[1] - if explore == False and 
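# Sketch of the neighbor selection in interval_neighbors above: discordant edges leaving the
# current interval are ranked by read-pair support, their far ends seed candidate neighbor
# intervals, positions already covered are skipped, and at most ten neighbors are kept before
# nearby candidates get merged. Intervals are simplified here to (chrom, start, end) tuples
# and the padding value is illustrative.
def pick_neighbors(far_ends, covered, pad=10000, max_neighbors=10):
    # far_ends: list of ((chrom, pos), support) for the far end of each discordant edge
    neighbors = []
    for (chrom, pos), support in sorted(far_ends, key=lambda e: e[1], reverse=True):
        if len(neighbors) >= max_neighbors:
            break
        if any(c == chrom and s <= pos <= e for c, s, e in covered):
            continue  # far end already inside a known interval
        candidate = (chrom, max(0, pos - pad), pos + pad)
        neighbors.append((candidate, support))
        covered = covered + [candidate]
    return neighbors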
len(hg.interval_list([ic]).intersection(i2list)) == 0: - seen_list.append(ic) - continue - logging.debug( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + " interval_hops: check rd " - + str(i) - + " " - + str(ic) - + " " - + str(len(hg.interval_list([ic]).intersection(rdlist))) - ) - if ( - len(hg.interval_list([ic]).intersection(i2list)) == 0 - and len(hg.interval_list([ic]).intersection(rdlist)) > 0 - ): - seen_list.append(ic) - continue - # logging.debug("#TIME " + '%.3f\t'%(time() - TSTART) + " interval_hops: search new " + str(i) + " " + str(ic)) - # logging.info("#TIME " + '%.3f\t'%(time() - TSTART) + " Searching new neighbors for interval: " + str(ic)) - icn = self.interval_neighbors(ic, clist, rdlist=rdlist, gcc=gcc) - logging.debug( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + " interval_hops: neighbors " - + str(i) - + " " - + str(ic) - + " " - + str(len(icn)) - ) - for ic2 in icn: - logging.info( - "#TIME " + "%.3f\t" % (time() - TSTART) + " New neighbor: %s (weight=%d)" % (str(ic2), ic2.info) - ) - contained = False - for i2 in clist: - if i2.contains(ic2): - contained = True - if contained: - continue - if ic2.size() < 2 * ms_window_size0 and len(self.interval_discordant_edges(ic2)) < 2: - continue - if explore or len(hg.interval_list([ic]).intersection(i2list)) > 0: - heapq.heappush(unseen_list, (-ic2.info, ic2)) - clist.append(ic2) - seen_list.append(ic) - retlist = hg.interval_list(i2list + seen_list) - retlist = [r[0] for r in retlist.merge_clusters(extend=1)] - return retlist - - def interval_amplified(self, i, filter_conserved=True, filter_small=True): - if ( - len( - hg.interval_list([i]).intersection(hg.conserved_regions) - + hg.interval_list([i]).intersection(hg.centromere_list) - ) - > 0 - ): - return False - ms_window_size = 10000 - num_w = 0 - num_high = 0 - if filter_small and i.size() < 2 * ms_window_size and len(self.interval_discordant_edges(i)) < 2: - return False - wc = self.window_coverage(i, ms_window_size, exact=False) - mc = self.median_coverage() - if self.span_coverage: - arm_coverage = self.median_coverage(refi=i) - else: - arm_coverage = mc - for w in wc: - num_w += 1 - # if w[1] > mc[0] + 3 * mc[2]: - if self.sensitivems == False: - if mc[0] < arm_coverage[0] and w[1] > max( - arm_coverage[0] + 3 * mc[2] * math.sqrt(arm_coverage[0] / mc[0]), - arm_coverage[0] + 3.0 * mc[0] / 2.0, - ): - num_high += 1 - elif mc[0] >= arm_coverage[0] and w[1] > max(mc[0] + 3 * mc[2], 5.0 * mc[0] / 2.0): - num_high += 1 - else: - if mc[0] < arm_coverage[0] and w[1] > arm_coverage[0] + 3 * mc[2] * math.sqrt(arm_coverage[0] / mc[0]): - num_high += 1 - elif mc[0] >= arm_coverage[0] and w[1] > mc[0] + 3 * mc[2]: - num_high += 1 - # wc_high = len([w for w in wc if w[1] > mc[1] + 3 * mc[2]]) - if num_high > num_w / 5: - return True - elif filter_small == False and i.size() < 2 * ms_window_size and len(self.interval_discordant_edges(i)) >= 2: - return True - else: - return False - - def interval_extend(self, i, strand=0, i0=None): - ms_window_size = 10000 - extend_size = max(i.size() / ms_window_size, 1) - max_window_size = 300000000 - if strand >= 0: - extend_right = 1 - right_size = extend_size - else: - extend_right = -1 - right_size = 0 - if strand <= 0: - extend_left = 1 - left_size = extend_size - else: - extend_left = -1 - left_size = 0 - ic = copy.copy(i) - while ic.size() < max_window_size and (extend_left >= 0 or extend_right >= 0): - if extend_right >= 0: - if right_size < 1: - extend_right = -1 - elif ic.end + right_size * ms_window_size > 
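# Sketch of the window test in interval_amplified above: an interval is called amplified when
# more than one fifth of its 10 kb windows exceed a coverage threshold derived from the
# genome-wide median and spread. The chromosome-arm correction applied above is omitted here.
def looks_amplified(window_covs, median_cov, cov_sd, high_fraction=0.2):
    threshold = max(median_cov + 3.0 * cov_sd, 2.5 * median_cov)
    n_high = sum(1 for c in window_covs if c > threshold)
    return n_high > high_fraction * len(window_covs)

print(looks_amplified([30, 31, 150, 160, 155], 30.0, 5.0))  # True: 3 of 5 windows are high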
hg.chrLen[hg.chrNum(ic.chrom)]: - if self.interval_amplified( - hg.interval(ic.chrom, ic.end, hg.chrLen[hg.chrNum(ic.chrom)]), filter_small=False - ): - ic.end = hg.chrLen[hg.chrNum(ic.chrom)] - extend_right = -1 - else: - extend_right = 0 - right_size = right_size / 2 - elif self.interval_amplified( - hg.interval(ic.chrom, ic.end, ic.end + right_size * ms_window_size), filter_small=False - ): - ic.end = ic.end + right_size * ms_window_size - if extend_right == 1: - right_size = 2 * right_size - else: - right_size = right_size / 2 - if right_size < 1: - # ic.end = min(ic.end + ms_window_size, hg.chrLen[hg.chrNum(ic.chrom)]) - extend_right = -1 - else: - extend_right = 0 - right_size = right_size / 2 - if extend_left >= 0: - if left_size < 1: - extend_left = -1 - elif ic.start - left_size * ms_window_size <= 1: - if self.interval_amplified(hg.interval(ic.chrom, 1, ic.start), filter_small=False): - ic.start = 1 - extend_left = -1 - else: - extend_left = 0 - left_size = left_size / 2 - elif self.interval_amplified( - hg.interval(ic.chrom, ic.start - left_size * ms_window_size, ic.start), filter_small=False - ): - ic.start = ic.start - left_size * ms_window_size - if extend_left == 1: - left_size = 2 * left_size - else: - left_size = left_size / 2 - if left_size < 1: - # ic.start = max(ic.end - ms_window_size, 1) - extent_left = -1 - else: - extend_left = 0 - left_size = left_size / 2 - if self.interval_amplified( - hg.interval( - ic.chrom, - max(0, ic.end - 2 * ms_window_size), - min(ic.end + 2 * ms_window_size, hg.chrLen[hg.chrNum(ic.chrom)]), - ), - filter_small=False, - ): - ic.end = min(ic.end + 10 * ms_window_size, hg.chrLen[hg.chrNum(ic.chrom)]) - if self.interval_amplified( - hg.interval( - ic.chrom, - max(ic.start - 2 * ms_window_size, 0), - min(ic.start + 2 * ms_window_size, hg.chrLen[hg.chrNum(ic.chrom)]), - ), - filter_small=False, - ): - ic.start = max(ic.start - 10 * ms_window_size, 0) - if strand >= 0: - ide = self.interval_discordant_edges( - hg.interval(ic.chrom, ic.end + 1, min(hg.chrLen[hg.chrNum(ic.chrom)], ic.end + ms_window_size)) - ) - for e in ide: - if e[0].v1.strand == 1: - ic.end = min(ic.end + 2 * ms_window_size, hg.chrLen[hg.chrNum(ic.chrom)]) - break - if strand <= 0: - ide = self.interval_discordant_edges(hg.interval(ic.chrom, max(0, ic.start - ms_window_size), ic.start - 1)) - for e in ide: - if e[0].v1.strand == -1: - ic.start = max(ic.start - 2 * ms_window_size, 0) - break - # if ic.size() > ms_window_size: - logging.debug( - "#TIME " + "%.3f\t" % (time() - TSTART) + " interval_extend: %s, %s, %s" % (str(i), strand, str(ic)) - ) - return ic - - # Method to create breakpoint graph, find network flow and cycle decomposition - def interval_filter_vertices( - self, ilist0, gcc=False, adaptive_counts=True, eilist=None, amplicon_name=None, runmode="FULL" - ): - ms_window_size0 = 10000 - ms_window_size1 = 300 - ilist0.sort() - ilist = hg.interval_list([a[0] for a in ilist0.merge_clusters()]) - - # finesearch edges near refined meanshifts and add to eilist, create vertices corresponding to all meanshifts and uncovered meanshifts - all_msv = [] - msv_diff = {} - all_msv_nocover = [] - logging.info("#TIME " + "%.3f\t" % (time() - self.tstart) + " Calculating coverage meanshift segmentation") - msrlist = [self.get_meanshift(i, ms_window_size0, ms_window_size1, gcc) for i in ilist] - logging.info( - "#TIME " + "%.3f\t" % (time() - self.tstart) + " Detecting breakpoint edges (interval filter vertices)" - ) - sensitive_elist = self.get_sensitive_discordant_edges( - ilist, - 
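# Loose sketch of the boundary search in interval_extend above: the flanking window grows by
# doubling while it still looks amplified and is repeatedly halved once it stops, so the
# amplicon edge is located with a logarithmic number of interval_amplified tests.
# is_amplified stands in for that test and the 10 kb unit matches ms_window_size.
def extend_right(end, chrom_len, is_amplified, step=1, unit=10000):
    growing = True
    while step >= 1:
        nxt = min(end + step * unit, chrom_len)
        if nxt > end and is_amplified(end, nxt):
            end = nxt
            step = step * 2 if growing else step // 2
        else:
            growing = False
            step //= 2
    return end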
msrlist, - eilist, - ms_window_size0=ms_window_size0, - ms_window_size1=ms_window_size1, - adaptive_counts=adaptive_counts, - amplicon_name=amplicon_name, - ) - eilist = sensitive_elist - logging.info("#TIME " + "%.3f\t" % (time() - self.tstart) + " Building breakpoint graph") - for i, msr in zip(ilist, msrlist): - elist = [] - for e in eilist: - if hg.interval(e[0].v1.chrom, e[0].v1.pos, e[0].v1.pos).intersects(i): - elist.append(e) - - ms_vlist = [] - msv_index = {} - for msi in range(len((msr)) - 1): - if msr[msi + 1].info["cn"] < msr[msi].info["cn"]: - msv = breakpoint_vertex(i.chrom, msr[msi].end, 1) - else: - msv = breakpoint_vertex(i.chrom, msr[msi].end + 1, -1) - msv_diff[msv] = msr[msi + 1].info["cn"] - msr[msi].info["cn"] - ms_vlist.append(msv) - msv_index[msv] = msi - all_msv.append(ms_vlist) - # logging.debug("Meanshift", str(i), len(ms_vlist), ms_vlist) - msve_match = {} - for msv in ms_vlist: - msi = msv_index[msv] - if msr[msi].info["end_refined"]: - msve = [ - e - for e in elist - if e[0].v1.strand * (msr[msi + 1].info["cn"] - msr[msi].info["cn"]) < 0 - and abs(e[0].v1.pos - msr[msi].end) < self.max_insert + ms_window_size1 - ] - else: - msve = [ - e - for e in elist - if e[0].v1.strand * (msr[msi + 1].info["cn"] - msr[msi].info["cn"]) < 0 - and abs(e[0].v1.pos - msr[msi].end) < self.max_insert + ms_window_size0 - ] - if len(msve) > 0: - msve_match[msv] = msve - msv_nocover = [msv for msv in ms_vlist if msv not in msve_match] - all_msv_nocover.append(msv_nocover) - # logging.debug("Meanshift no cover", str(i), msv_nocover) - - # setup graph for flow optimization - ngvlist_full = [] - elist_full = [] - ms_addlist = [] - kce = defaultdict(lambda: 0) # number of concordant reads - koe = defaultdict(lambda: 0.0) # number of reads mapping outside the interval - kbpe = defaultdict(lambda: 0.0) # number of discordant reads across breakpoint edge - new_graph = breakpoint_graph() - s = new_graph.new_vertex(ilist[0].chrom, -1, -1) - for i, msr, ms_vlist, msv_nocover in zip(ilist, msrlist, all_msv, all_msv_nocover): - ngvlist = [] - elist = [] - for e in eilist: - if hg.interval(e[0].v1.chrom, e[0].v1.pos, e[0].v1.pos).intersects(i): - elist.append(e) - - # add vertices to new_graph - ei = 0 - nei = 0 - msi = 0 - if len(elist) == 0 or elist[ei][0].v1.strand == 1 or elist[ei][0].v1.pos > i.start: - if len(msv_nocover) == 0 or msv_nocover[msi].strand == 1 or msv_nocover[msi].pos > i.start: - nv = new_graph.new_vertex(i.chrom, i.start, -1) - ne = new_graph.new_edge(s, nv) - koe[ne] = len(self.interval_crossing_arcs(i.chrom, i.start, i.start + self.max_insert, -1, ilist)) - else: # len(ms_vlist) > 0 and ms_vlist[0].strand == -1 and ms_vlist[0].pos > i.start + self.max_insert - nv = new_graph.new_vertex(i.chrom, msv_nocover[msi].pos, msv_nocover[msi].strand) - ne = new_graph.new_edge(s, nv) - koe[ne] = len( - self.interval_crossing_arcs( - i.chrom, msv_nocover[msi].pos, msv_nocover[msi].strand + self.max_insert, -1, ilist - ) - ) - ms_addlist.append(msv_nocover[msi]) - msi += 1 - else: - nv = new_graph.new_vertex(i.chrom, elist[0][0].v1.pos, -1) - oecount = len(self.interval_crossing_arcs(nv.chrom, nv.pos, nv.pos + self.max_insert, -1, ilist)) - if oecount >= ( - self.pair_support - if not adaptive_counts - else self.pair_support_count(nv.chrom, nv.pos, -1, meanshift=msrlist, sensitivems=False) - ): - ne = new_graph.new_edge(s, nv) - koe[ne] = len(self.interval_crossing_arcs(nv.chrom, nv.pos, nv.pos + self.max_insert, -1, ilist)) - ei += 1 - ngvlist.append(nv) - vc = 
breakpoint_vertex(ngvlist[0].chrom, ngvlist[0].pos, ngvlist[0].strand) - while ei < len(elist) or msi < len(msv_nocover): - vp = vc - vc_type = "edge" - if msi >= len(msv_nocover): - vc = elist[ei][0].v1 - ei += 1 - elif ei >= len(elist): - vc = msv_nocover[msi] - vc_type = "meanshift" - ms_addlist.append(msv_nocover[msi]) - msi += 1 - elif elist[ei][0].v1.pos < msv_nocover[msi].pos: - vc = elist[ei][0].v1 - ei += 1 - elif elist[ei][0].v1.pos == msv_nocover[msi].pos and elist[ei][0].v1.strand < msv_nocover[msi].strand: - vc = elist[ei][0].v1 - ei += 1 - else: - vc = msv_nocover[msi] - vc_type = "meanshift" - ms_addlist.append(msv_nocover[msi]) - msi += 1 - if (vc.pos == vp.pos and vc.strand <= vp.strand) or (vc.pos == vp.pos + 1 and vc.strand < vp.strand): - continue - logging.debug( - "#TIME " + "%.3f\t" % (time() - TSTART) + "interval_filter vertices new: " + str(vc) + " " + vc_type - ) - if vc.strand == 1: - if ngvlist[nei].strand == 1: - nvc_prime = new_graph.new_vertex(ngvlist[nei].chrom, ngvlist[nei].pos + 1, -1) - # oecount = len(self.interval_crossing_arcs(nvc_prime.chrom, nvc_prime.pos, nvc_prime.pos + self.max_insert, -1, ilist)) - # if oecount >= (self.pair_support if not adaptive_counts else self.pair_support_count(nvc_prime.chrom, nvc_prime.pos, -1, meanshift=zip(ilist, msrlist, cnlist), sensitivems=False)): - # ne = new_graph.new_edge(s, nvc_prime) - # koe[ne] = oecount - ce = self.concordant_edge(vp) - nce = new_graph.new_edge(ngvlist[nei], nvc_prime) - kce[nce] = len(ce[1]) - ngvlist.append(nvc_prime) - nei += 1 - nv = new_graph.new_vertex(vc.chrom, vc.pos, 1) - if vc_type == "meanshift": - oecount = len( - self.interval_crossing_arcs( - nv.chrom, max(0, nv.pos - 2 * self.max_insert), nv.pos + 2 * self.max_insert, 1, ilist - ) - ) - else: - oecount = len( - self.interval_crossing_arcs(nv.chrom, max(0, nv.pos - self.max_insert), nv.pos, 1, ilist) - ) - # if vc_type == 'meanshift' or oecount >= (self.pair_support if not adaptive_counts else self.pair_support_count(nv.chrom, nv.pos, 1, meanshift=zip(ilist, msrlist, cnlist))): - if vc_type == "meanshift": - ne = new_graph.new_edge(s, nv) - koe[ne] = oecount - ngvlist.append(nv) - nei += 1 - else: - logging.debug( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + "interval_filter vertices: adding reverse edge = " - + str(vc) - ) - if ngvlist[nei].strand == 1 and not ( - ngvlist[nei].chrom == vc.chrom and ngvlist[nei].pos == vc.pos - 1 - ): - nvc_prime = new_graph.new_vertex(ngvlist[nei].chrom, ngvlist[nei].pos + 1, -1) - oecount = len( - self.interval_crossing_arcs( - nvc_prime.chrom, nvc_prime.pos, nvc_prime.pos + self.max_insert, -1, ilist - ) - ) - # if oecount >= (self.pair_support if not adaptive_counts else self.pair_support_count(nvc_prime.chrom, nvc_prime.pos, -1, meanshift=zip(ilist, msrlist, cnlist), sensitivems=False)): - # ne = new_graph.new_edge(s, nvc_prime) - # koe[ne] = oecount - ce = self.concordant_edge(vp) - nce = new_graph.new_edge(ngvlist[nei], nvc_prime) - kce[nce] = len(ce[1]) - ngvlist.append(nvc_prime) - nei += 1 - if ngvlist[nei].strand == -1: - nvc_prime = new_graph.new_vertex(vc.chrom, vc.pos - 1, 1) - oecount = len( - self.interval_crossing_arcs( - nvc_prime.chrom, max(0, nvc_prime.pos - self.max_insert), nvc_prime.pos, 1, ilist - ) - ) - # if oecount >= (self.pair_support if not adaptive_counts else self.pair_support_count(nvc_prime.chrom, nvc_prime.pos, 1, meanshift=zip(ilist, msrlist, cnlist), sensitivems=False)): - # ne = new_graph.new_edge(s, nvc_prime) - # koe[ne] = oecount - 
ngvlist.append(nvc_prime) - nei += 1 - nv = new_graph.new_vertex(vc.chrom, vc.pos, -1) - if vc_type == "meanshift": - oecount = len( - self.interval_crossing_arcs( - nv.chrom, max(0, nv.pos - 2 * self.max_insert), nv.pos + 2 * self.max_insert, -1, ilist - ) - ) - else: - oecount = len( - self.interval_crossing_arcs(nv.chrom, nv.pos, nv.pos + self.max_insert, -1, ilist) - ) - # if vc_type == 'meanshift' or oecount >= (self.pair_support if not adaptive_counts else self.pair_support_count(nv.chrom, nv.pos, -1, meanshift=zip(ilist, msrlist, cnlist), sensitivems=False)): - if vc_type == "meanshift": - ne = new_graph.new_edge(s, nv) - koe[ne] = oecount - ce = self.concordant_edge(vc) - nce = new_graph.new_edge(ngvlist[nei], nv) - kce[nce] = len(ce[1]) - ngvlist.append(nv) - nei += 1 - # ei += 1 - if ngvlist[nei].strand == -1: - nv = new_graph.new_vertex(i.chrom, i.end, 1) - ne = new_graph.new_edge(s, nv) - koe[ne] = len(self.interval_crossing_arcs(nv.chrom, nv.pos - self.max_insert, nv.pos, 1, ilist)) - ngvlist.append(nv) - nei += 1 - elif ngvlist[nei].strand == 1 and ngvlist[nei].pos < i.end: - nvc_prime = new_graph.new_vertex(ngvlist[nei].chrom, ngvlist[nei].pos + 1, -1) - oecount = len( - self.interval_crossing_arcs( - nvc_prime.chrom, - nvc_prime.pos, - min(hg.chrLen[hg.chrNum(nvc_prime.chrom)], nvc_prime.pos + self.max_insert), - -1, - ilist, - ) - ) - if oecount >= ( - self.pair_support - if not adaptive_counts - else self.pair_support_count( - nvc_prime.chrom, nvc_prime.pos, -1, meanshift=msrlist, sensitivems=False - ) - ): - ne = new_graph.new_edge(s, nvc_prime) - koe[ne] = oecount - ce = self.concordant_edge(vp) - nce = new_graph.new_edge(ngvlist[nei], nvc_prime) - kce[nce] = len(ce[1]) - ngvlist.append(nvc_prime) - nei += 1 - nv = new_graph.new_vertex(i.chrom, i.end, 1) - ne = new_graph.new_edge(s, nv) - koe[ne] = len(self.interval_crossing_arcs(nv.chrom, nv.pos - self.max_insert, nv.pos, 1, ilist)) - ngvlist.append(nv) - nei += 1 - ngvlist_full = ngvlist_full + ngvlist - elist_full = elist_full + elist - print("MSstats", len(ms_vlist), len(ms_addlist)) - for msa in ms_vlist: - print( - "MSadd", - str(msa), - msv_diff[msa], - self.foldup_count(msa.chrom, msa.pos, msa.strand), - msa in ms_addlist, - ) # , self.pair_support_count(msa.chrom, msa.pos, msa.strand, ms, True) - for e0 in elist_full: - e = e0[0] - if len(ilist.intersection([hg.interval(e.v2.chrom, e.v2.pos, e.v2.pos)])) > 0 and e.v1.pos >= e.v2.pos: - ne = new_graph.add_edge(e) - logging.debug( - "#TIME " + "%.3f\t" % (time() - TSTART) + "interval_filter vertices: added edge e = " + str(e) - ) - logging.debug( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + "interval_filter vertices: added edge ne = " - + str(ne) - + " " - + ne.edge_type - ) - logging.debug( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + "interval_filter vertices: added edge ne, v1.elist = " - + str(ne.v1) - + " " - + ",".join(map(str, ne.v1.elist)) - ) - logging.debug( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + "interval_filter vertices: added edge ne, v2.elist = " - + str(ne.v2) - + " " - + ",".join(map(str, ne.v2.elist)) - ) - if ne is None: - raise ValueError( - "ne is None:" - + str(e) - + " " - + str(len(e0[1])) - + "\n" - + ",".join(map(str, new_graph.vs.values())) - ) - kbpe[ne] = e0[1] - elif len(ilist.intersection([hg.interval(e.v2.chrom, e.v2.pos, e.v2.pos)])) == 0: - ne = new_graph.add_edge(breakpoint_edge(breakpoint_vertex(s.chrom, s.pos, s.strand), e.v1)) - koe[ne] = e0[1] - for nei in range(1, len(ngvlist_full)): - if 
ngvlist_full[nei].strand == 1: - new_graph.new_edge(ngvlist_full[nei - 1], ngvlist_full[nei], edge_type="sequence") - # else: - # new_graph.new_edge(ngvlist[nei-1], ngvlist[nei]) - for e in koe: - koe[e] = max(0.0001, koe[e]) - # set up all constants - logging.info("#TIME " + "%.3f\t" % (time() - self.tstart) + " Optimizing graph copy number flow") - C = self.median_coverage()[0] / 2 - print("C (haploid coverage) = ", C) - # G = new_graph - - seqlist = [e for e in new_graph.es.values() if e.edge_type == "sequence"] - n = len(seqlist) - l = [abs(e.v2.pos - e.v1.pos) + 1 for e in seqlist] - k = [len([a for a in self.fetch(e.v1.chrom, e.v1.pos, e.v2.pos)]) for e in seqlist] - # kgcc = [self.interval_coverage(hg.interval(i.chrom, e.v1.pos, e.v2.pos), gcc=True) * (e.v2.pos - e.v1.pos) / self.read_length for e in seqlist] - # k = kgcc - # kcc = [self.interval_coverage(hg.interval(e.v1.chrom, e.v1.pos, e.v2.pos)) * (e.v2.pos - e.v1.pos) for e in seqlist] - ke = {} - ke.update(kbpe) - ke.update(kce) - ke.update(koe) - K = [ - len([a for a in self.fetch(e.v1.chrom, e.v1.pos, e.v2.pos)]) - * self.read_length - / (abs(e.v2.pos - e.v1.pos) + 1.0) - for e in seqlist - ] - # edge read count kbpe defined above - bplist = [e for e in new_graph.es.values() if (e.edge_type == "discordant" or e.edge_type == "breakpoint")] - m = len(bplist) - bpdict = {bplist[bpi]: bpi for bpi in range(len(bplist))} - print( - "########## len bplist", - len(bplist), - "; ################ kbpe, kce, koe = ", - len(kbpe), - len(kce), - len(koe), - ) - - # set up problem size and coefficients - - asub = [] - aval = [] - for i in range(n): - subarr = [i] - valarr = [1.0] - for e in seqlist[i].v1.elist: - if e.edge_type == "sequence": - continue - if n + bpdict[e] in subarr: - j = subarr.index(n + bpdict[e]) - valarr[j] += -1.0 - else: - subarr.append(n + bpdict[e]) - valarr.append(-1.0) - asub.append(subarr) - aval.append(valarr) - subarr = [i] - valarr = [1.0] - for e in seqlist[i].v2.elist: - if e.edge_type == "sequence": - continue - if n + bpdict[e] in subarr: - j = subarr.index(n + bpdict[e]) - valarr[j] += -1.0 - else: - subarr.append(n + bpdict[e]) - valarr.append(-1.0) - asub.append(subarr) - aval.append(valarr) - - coeff_f = [-1 * ki for ki in k] + [-1 * ke[e] for e in bplist] - coeff_g = [C * li / self.read_length for li in l] + [ - (self.max_insert) * C / 2 / self.read_length for e in bplist - ] - const_h = [0.0001] * (n + m) - coeff_c = [C * li / self.read_length for li in l] + [ - (self.max_insert) * C / 2 / self.read_length for e in bplist - ] - - # Solve the optimization problem - res = mosek_solver.call_mosek(n, m, asub, aval, coeff_c, coeff_f, coeff_g, const_h) - - wehc = {} - for msv_ilist in zip(all_msv, ilist): - slist = hg.interval_list( - [ - hg.interval("\t".join(map(str, [sq[0].v1.chrom, sq[0].v1.pos, sq[0].v2.pos, sq[1]]))) - for sq in zip(seqlist, res) - ] - ) - slist.sort() - msl = [msv_ilist[1].start] + [v.pos for v in msv_ilist[0]] + [msv_ilist[1].end] - mslist = hg.interval_list( - [hg.interval(msv_ilist[1].chrom, msl[i], msl[i + 1]) for i in range(len(msl) - 1)] - ) - for msi in mslist: - if len(hg.interval_list([msi]).intersection(slist)) == 0: - print("MSnointersection", str(msi), msl) - for s in slist: - print(str(s)) - print("=============================") - for s in seqlist: - print(str(s)) - exit() - elif sum([ap[0].intersection(ap[1]).size() for ap in hg.interval_list([msi]).intersection(slist)]) == 0: - print("MS0intersection", str(msi)) - exit() - - edge_code = defaultdict(lambda: 
"discordant", {"concordant": "concordant", "source": "source"}) - - graph_logger.info( - "SequenceEdge: StartPosition, EndPosition, PredictedCopyCount, AverageCoverage, Size, NumberReadsMapped" - ) - for si in range(n): - graph_logger.info( - "sequence\t" - + "\t".join( - map( - str, - [ - seqlist[si].v1, - seqlist[si].v2, - res[si], - K[si], - seqlist[si].v2.pos - seqlist[si].v1.pos, - k[si], - ], - ) - ) - ) - wehc[seqlist[si]] = float(res[si]) - graph_logger.info( - "BreakpointEdge: StartPosition->EndPosition, PredictedCopyCount, NumberOfReadPairs, HomologySizeIfAvailable(<0ForInsertions), Homology/InsertionSequence" - ) - for bpi in range(m): - # print edge_code[bplist[bpi].type()], str(bplist[bpi]), res[n + bpi], ke[bplist[bpi]], bplist[bpi].kmer_homology() - graph_logger.info( - "\t".join( - map( - str, - [ - edge_code[bplist[bpi].type()], - bplist[bpi], - res[n + bpi], - ke[bplist[bpi]], - bplist[bpi].hom, - bplist[bpi].hom_seq, - ], - ) - ) - ) - wehc[bplist[bpi]] = float(res[n + bpi]) - lenlist = len(ilist) - if len(ilist0) >= 10: - lenlist = len(ilist0) - all_msv_cat = reduce(lambda x, y: x + y, all_msv, []) - oncolist = ",".join(set([a[1].info["Name"] for a in ilist.intersection(hg.oncogene_list)])) + "," - istr = ",".join([i.chrom + ":" + str(i.start) + "-" + str(i.end) for i in ilist]) - summary_logger.info("TotalIntervalSize = " + str(sum([a.size() for a in ilist]))) - summary_logger.info( - "AmplifiedIntervalSize = " - + str(sum([seqlist[si].v2.pos - seqlist[si].v1.pos for si in range(n) if res[si] >= 2.5])) - ) - if len([seqlist[si].v2.pos - seqlist[si].v1.pos for si in range(n) if res[si] >= 2.5]) > 0: - summary_logger.info( - "AverageAmplifiedCopyCount = " - + str( - sum([res[si] * (seqlist[si].v2.pos - seqlist[si].v1.pos) for si in range(n) if res[si] >= 2.5]) - / sum([seqlist[si].v2.pos - seqlist[si].v1.pos + 1 for si in range(n) if res[si] >= 2.5]) - ) - ) - else: - summary_logger.info("AverageAmplifiedCopyCount = 2") - summary_logger.info("#Chromosomes = " + str(len(set([i.chrom for i in ilist])))) - summary_logger.info("#SeqenceEdges = " + str(n)) - summary_logger.info("#BreakpointEdges = " + str(len(kbpe))) - summary_logger.info("#CoverageShifts = " + str(len(all_msv_cat))) - summary_logger.info("#MeanshiftSegmentsCopyCount>5 = " + str(len([v for v in msv_diff.values() if v > 5]))) - summary_logger.info( - "#Foldbacks = " - + str(len([msa for msa in all_msv_cat if self.foldup_count(msa.chrom, msa.pos, msa.strand) >= 1])) - ) - summary_logger.info( - "#CoverageShiftsWithBreakpointEdges = " + str(len([msa for msa in all_msv_cat if msa in ms_addlist])) - ) - - # Summary, #intervals, t talsize, size>2.5, AvgCoverage>2.5, #chromosomes, #sequenceedges, #breakpointedges, #meanshiftbreaks, #meanshift>5, #msfoldbackedges, #msfoldbackedges, #mswithoutbreakpoint, oncogenes, representativestring, #bpedgeswithcommonkmers - if len([seqlist[si].v2.pos - seqlist[si].v1.pos for si in range(n) if res[si] >= 2.5]) > 0: - # print '\t'.join(map(str, ["Summary:", lenlist, sum([a.size() for a in ilist]), sum([seqlist[si].v2.pos - seqlist[si].v1.pos for si in range(n) if res[si] >= 2.5]), sum([res[si] * (seqlist[si].v2.pos - seqlist[si].v1.pos) for si in range(n) if res[si] >= 2.5]) / sum([seqlist[si].v2.pos - seqlist[si].v1.pos for si in range(n) if res[si] >= 2.5]), len(Set([i.chrom for i in ilist])), n, len(kbpe), len(all_msv_cat), len([v for v in msv_diff.values() if v > 5]), len([msa for msa in all_msv_cat if self.foldup_count(msa.chrom, msa.pos, msa.strand) >= 1]), len([msa for msa 
in all_msv_cat if self.foldup_count(msa.chrom, msa.pos, msa.strand) >= 1]), len([msa for msa in all_msv_cat if msa in ms_addlist]), oncolist, istr, len([e for e in kbpe if e.kmer_homology()])])) - print( - "\t".join( - map( - str, - [ - "Summary:", - lenlist, - sum([a.size() for a in ilist]), - sum([seqlist[si].v2.pos - seqlist[si].v1.pos + 1 for si in range(n) if res[si] >= 2.5]), - sum( - [ - res[si] * (seqlist[si].v2.pos - seqlist[si].v1.pos + 1) - for si in range(n) - if res[si] >= 2.5 - ] - ) - / sum([seqlist[si].v2.pos - seqlist[si].v1.pos + 1 for si in range(n) if res[si] >= 2.5]), - len(set([i.chrom for i in ilist])), - n, - len(kbpe), - len(all_msv_cat), - len([v for v in msv_diff.values() if v > 5]), - len([msa for msa in all_msv_cat if self.foldup_count(msa.chrom, msa.pos, msa.strand) >= 1]), - len([msa for msa in all_msv_cat if self.foldup_count(msa.chrom, msa.pos, msa.strand) >= 1]), - len([msa for msa in all_msv_cat if msa in ms_addlist]), - oncolist, - istr, - ], - ) - ) - ) - for i in ilist: - if ( - sum( - [ - seqlist[si].v2.pos - seqlist[si].v1.pos + 1 - for si in range(n) - if res[si] >= 5 - and hg.interval(seqlist[si].v1.chrom, seqlist[si].v1.pos, seqlist[si].v2.pos).intersects(i) - ] - ) - == 0 - ): - print("IntervalAmplifiedSize: ", i.chrom, i.start, i.end, 0, 2) - continue - print( - "IntervalAmplifiedSize: ", - i.chrom, - i.start, - i.end, - sum( - [ - seqlist[si].v2.pos - seqlist[si].v1.pos + 1 - for si in range(n) - if res[si] >= 5 - and hg.interval(seqlist[si].v1.chrom, seqlist[si].v1.pos, seqlist[si].v2.pos).intersects(i) - ] - ), - sum( - [ - res[si] * (seqlist[si].v2.pos - seqlist[si].v1.pos + 1) - for si in range(n) - if res[si] >= 5 - and hg.interval(seqlist[si].v1.chrom, seqlist[si].v1.pos, seqlist[si].v2.pos).intersects(i) - ] - ) - / sum( - [ - seqlist[si].v2.pos - seqlist[si].v1.pos + 1 - for si in range(n) - if res[si] >= 5 - and hg.interval(seqlist[si].v1.chrom, seqlist[si].v1.pos, seqlist[si].v2.pos).intersects(i) - ] - ), - ) - else: - # print '\t'.join(map(str, ["Summary:", lenlist, sum([a.size() for a in ilist]), sum([seqlist[si].v2.pos - seqlist[si].v1.pos for si in range(n) if res[si] >= 2.5]), sum([res[si] * (seqlist[si].v2.pos - seqlist[si].v1.pos) for si in range(n)]) / sum([seqlist[si].v2.pos - seqlist[si].v1.pos for si in range(n)]), len(Set([i.chrom for i in ilist])), n, len(kbpe), len(all_msv_cat), len([v for v in msv_diff.values() if v > 5]), len([msa for msa in all_msv_cat if self.foldup_count(msa.chrom, msa.pos, msa.strand) >= 1]), len([msa for msa in all_msv_cat if self.foldup_count(msa.chrom, msa.pos, msa.strand) >= 1]), len([msa for msa in all_msv_cat if msa in ms_addlist]), oncolist, istr, len([e for e in kbpe if e.kmer_homology()])])) - print( - "\t".join( - map( - str, - [ - "Summary:", - lenlist, - sum([a.size() for a in ilist]), - sum([seqlist[si].v2.pos - seqlist[si].v1.pos + 1 for si in range(n) if res[si] >= 2.5]), - sum([res[si] * (seqlist[si].v2.pos - seqlist[si].v1.pos + 1) for si in range(n)]) - / sum([seqlist[si].v2.pos - seqlist[si].v1.pos + 1 for si in range(n)]), - len(set([i.chrom for i in ilist])), - n, - len(kbpe), - len(all_msv_cat), - len([v for v in msv_diff.values() if v > 5]), - len([msa for msa in all_msv_cat if self.foldup_count(msa.chrom, msa.pos, msa.strand) >= 1]), - len([msa for msa in all_msv_cat if self.foldup_count(msa.chrom, msa.pos, msa.strand) >= 1]), - len([msa for msa in all_msv_cat if msa in ms_addlist]), - oncolist, - istr, - ], - ) - ) - ) - - if runmode == "BPGRAPH": - return - 
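For context on the optimization in the removed module above: the MOSEK call estimates copy counts jointly over sequence and breakpoint edges, but the first-order quantity it is fitting for each sequence edge is simply the segment's average coverage divided by the haploid coverage C = median_coverage / 2 computed earlier. The snippet below is a hypothetical, self-contained sketch of that zeroth-order relationship; the function name and numbers are illustrative and are not code from the deleted file or from this patch.

```python
# Hypothetical illustration only -- not part of the deleted module.
def naive_copy_number(read_count, segment_len, read_length, haploid_cov):
    """Rough per-segment copy number: average coverage / haploid coverage.

    The deleted code refines this with a joint flow optimization (MOSEK)
    over sequence and breakpoint edges; this is just the first-order term.
    """
    avg_cov = read_count * read_length / max(segment_len, 1)
    return avg_cov / haploid_cov


# Example: 15,000 reads of 150 bp on a 100 kb segment, haploid coverage 15x
print(naive_copy_number(15_000, 100_000, 150, 15.0))  # -> 1.5
```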
logging.info("#TIME " + "%.3f\t" % (time() - self.tstart) + " Plotting SV View") - - interval_index = 1 - for i in ilist: - cycle_logger.info("Interval\t" + "\t".join([str(interval_index), i.chrom, str(i.start), str(i.end)])) - interval_index += 1 - - new_graph.cycle_decomposition(wehc, s) - - # Plot coverage, meanshift copy count estimates and discordant edges in interval - def plot_segmentation(self, ilist, amplicon_name, segments=[], scale_list=[], eilist=None, font="small"): - fighsize = 12 - figvsize = 5 - if font == "large": - matplotlib.rcParams.update({"font.size": 18}) - figvsize = 5.85 - if font == "all_amplicons": - matplotlib.rcParams.update({"font.size": 48}) - figvsize = 5.21 - fighsize = 24 - fig = plt.figure(figsize=(fighsize, figvsize)) - plt.subplots_adjust(left=73 / 1000.0, right=1 - 73 / 1000.0, bottom=1 / 4.0, top=1 - 1 / 10.0) - # dpi = 300 - if font == "large": - plt.subplots_adjust(left=73 / 1000.0, right=1 - 73 / 1000.0, bottom=2.1 / 5.85, top=90 / 100.0) - if font == "all_amplicons": - plt.subplots_adjust(left=73 / 1000.0, right=1 - 73 / 1000.0, bottom=1 / 5.21, top=95 / 100.0) - - dpi = 1000.0 / fighsize - gs = gridspec.GridSpec(2, 1, height_ratios=[8, 2]) - if font == "all_amplicons": - gs = gridspec.GridSpec(2, 1, height_ratios=[5, 4]) - ax = fig.add_subplot(gs[0, 0]) - if font == "large": - plt.title(os.path.basename(amplicon_name), fontsize=28) - elif font != "all_amplicons": - plt.title(os.path.basename(amplicon_name)) - # if font == 'all_amplicons': - # plt.title(os.path.basename(amplicon_name), fontsize=56) - ax2 = ax.twinx() - ax2.set_ylabel("CN") - ax3 = fig.add_subplot(gs[1, 0], sharex=ax) - ax.set_xlim(0, 1) - ax.set_ylabel("Coverage") - ax.yaxis.set_label_coords(-0.05, 0.25) - ax2.yaxis.set_label_coords(1.05, 0.33) - if font == "all_amplicons": - ax.set_ylabel("") - ax2.set_ylabel("") - for b in ilist.offset_breaks(): - ax.axvline(b[0], linestyle=b[1], color="k") - ax3.axvline(b[0], linestyle=b[1], color="k") - - cx = [] - wc = [] - - elist_dict = {} - max_edge = 4 - scale_max_cov = 0 - scale_max_ms = 0 - # msrlist = [self.get_meanshift(i) if i.size() > 50000 else self.meanshift_segmentation(i, window_size=300) for i in ilist] - msrlist = [ - self.get_meanshift(i) if i.size() > 50000 else self.get_meanshift(i, window_size0=300) for i in ilist - ] - sensitive_elist = self.get_sensitive_discordant_edges( - ilist, - msrlist, - eilist, - ms_window_size0=10000, - ms_window_size1=300, - adaptive_counts=True, - amplicon_name=amplicon_name, - ) - eilist = sensitive_elist - - for i, msr in zip(ilist, msrlist): - de = [ - e - for e in eilist - if e[0].v1.pos != -1 and hg.interval(e[0].v1.chrom, e[0].v1.pos, e[0].v1.pos).intersects(i) - ] # self.interval_discordant_edges(i) - elist_dict[i] = de - elist_dict[i].sort(key=lambda x: hg.absPos(x[0].v1.chrom, x[0].v1.pos) + 0.1 * x[0].v1.strand) - for e in eilist: - eposlist = [] - if e[0].v1.pos != -1: - eposlist.append(hg.interval(e[0].v1.chrom, e[0].v1.pos, e[0].v1.pos)) - if e[0].v2.pos != -1: - eposlist.append(hg.interval(e[0].v2.chrom, e[0].v2.pos, e[0].v2.pos)) - if len(scale_list) == 0 or len(hg.interval_list(eposlist).intersection(scale_list)) > 0: - max_edge = max(max_edge, e[1]) - - for i in ilist: - if i.size() > 1000000: - wc_i = [w for w in self.window_coverage(i, 10000, exact=False)] - - elif i.size() > 100000: - wc_i = [w for w in self.window_coverage(i, 1000, exact=False)] - - else: - wc_i = [w for w in self.window_coverage(i, 150, exact=False)] - - cx += [((i.chrom, (c[0].start + c[0].end) / 2), 
c[1]) for c in wc_i] - wc += wc_i - - cx0 = [c for c in cx if ilist.xpos(c[0][0], c[0][1]) is not None] - ax.bar( - [ilist.xpos(c[0][0], c[0][1]) for c in cx0], - [c[1] for c in cx0], - 0.0001, - zorder=1, - edgecolor="0.7", - linewidth=1, - ) - # cmax = max([c[1] for c in wc]) - # logging.debug("cmax was " + str(cmax)) - - covl = [] - for i, msr in zip(ilist, msrlist): - for seg in msr: - avg_cov = np.average([c[1] for c in cx0 if c[0][0] == seg.chrom and seg.start <= c[0][1] <= seg.end]) - if len(scale_list) == 0 or len(hg.interval_list([i]).intersection(scale_list)) > 0: - covl += [c[1] for c in cx0 if c[0][0] == seg.chrom and seg.start <= c[0][1] <= seg.end] - scale_max_cov = max(scale_max_cov, avg_cov) - if seg.info["cn"] != float("inf"): - scale_max_ms = max(scale_max_ms, seg.info["cn"]) - else: - scale_max_ms = max(scale_max_ms, 2000) - - ax2.plot( - (ilist.xpos(seg.chrom, max(i.start, seg.start)), ilist.xpos(seg.chrom, min(i.end, seg.end))), - (seg.info["cn"], seg.info["cn"]), - linewidth=4, - color="k", - ) - - logging.debug("Max cov, max ms scales set to: " + str(scale_max_cov) + " " + str(scale_max_ms)) - covl.sort() - if len(covl) > 0: - m95cov = covl[-(len(covl) // 20)] - else: - m95cov = 0 - - if 0 < scale_max_cov < m95cov: - scale_max_ms = scale_max_ms * m95cov / scale_max_cov - scale_max_cov = scale_max_cov * m95cov / scale_max_cov - - y_scale = 3.0 - # y_scale = 2.5 - if font == "all_amplicons": - y_scale = 2.5 - if scale_max_cov > 0: - ax.set_ylim(0.1, y_scale * scale_max_cov) - else: - (ymin, ymax) = ax.get_ylim() - ax.set_ylim(ymin, y_scale * ymax) - if scale_max_ms > 0: - (ymin, ymax) = (0.1, scale_max_ms) - ax2.set_ylim(0.1, y_scale * ymax) - else: - (ymin, ymax) = ax2.get_ylim() - ax2.set_ylim(0.1, y_scale * ymax) - - for i in ilist: - for el in elist_dict[i]: - e = el[0] - if ilist.xpos(e.v2.chrom, e.v2.pos) is None and ilist.xpos(e.v1.chrom, e.v1.pos) is None: - continue - elif ilist.xpos(e.v2.chrom, e.v2.pos) is None: - ax2.axvline( - ilist.xpos(e.v1.chrom, e.v1.pos), - color=ecolor[e.type()], - linewidth=4.0 * min(1, float(el[1]) / max_edge), - alpha=0.5, - zorder=10, - ) - ax2.plot( - (ilist.xpos(e.v1.chrom, e.v1.pos), ilist.xpos(e.v1.chrom, e.v1.pos) - 0.01 * e.v1.strand), - (0, 0), - linewidth=8.0 * min(1, float(el[1]) / max_edge), - color=ecolor[e.type()], - ) - elif ilist.xpos(e.v1.chrom, e.v1.pos) is None: - ax2.axvline( - ilist.xpos(e.v2.chrom, e.v2.pos), - color=ecolor[e.type()], - linewidth=4.0 * min(1, float(el[1]) / max_edge), - alpha=0.5, - zorder=10, - ) - ax2.plot( - (ilist.xpos(e.v2.chrom, e.v2.pos), ilist.xpos(e.v2.chrom, e.v2.pos) - 0.01 * e.v2.strand), - (0, 0), - linewidth=8.0 * min(1, float(el[1]) / max_edge), - color=ecolor[e.type()], - ) - else: - xmid = (ilist.xpos(e.v1.chrom, e.v1.pos) + ilist.xpos(e.v2.chrom, e.v2.pos)) / 2 - xdia = abs(ilist.xpos(e.v2.chrom, e.v2.pos) - ilist.xpos(e.v1.chrom, e.v1.pos)) - ydia = (1.0 + xdia) * 3 * ymax - pseudo_edge = breakpoint_edge( - breakpoint_vertex(e.v1.chrom, hg.absPos(e.v1.chrom, e.v1.pos), e.v1.strand), - breakpoint_vertex(e.v1.chrom, hg.absPos(e.v2.chrom, e.v2.pos), e.v2.strand), - ) - ee = Arc( - (xmid, 0), - xdia, - ydia, - fill=False, - linewidth=4.0 * min(1, float(el[1]) / max_edge), - color=ecolor[pseudo_edge.type()], - zorder=4, - theta1=0.1, - theta2=180, - ) - ax2.add_patch(ee) - ax2.plot( - (ilist.xpos(e.v1.chrom, e.v1.pos), ilist.xpos(e.v1.chrom, e.v1.pos) - 0.01 * e.v1.strand), - (0, 0), - linewidth=8.0 * min(1, float(el[1]) / max_edge), - color=ecolor[pseudo_edge.type()], - ) - 
ax2.plot( - (ilist.xpos(e.v2.chrom, e.v2.pos), ilist.xpos(e.v2.chrom, e.v2.pos) - 0.01 * e.v2.strand), - (0, 0), - linewidth=8.0 * min(1, float(el[1]) / max_edge), - color=ecolor[pseudo_edge.type()], - ) - ax2.axhline(2.0, alpha=0.8, linewidth=0.5, color="r") - - gparity = 0 - ry = 0.60 - ty = 0.65 - ogene_width = 4 - if font == "large": - # ry = 0.85 - # ry = 0.87 - ry = 0.77 - ogene_width = 12 - ogene_plotted = [] - for i in ilist: - glist = hg.interval_list([i]).intersection(hg.oncogene_list) - ogene_plotted += [g[1].info["Name"] for g in glist] - for g in glist: - if font == "large": - ty = 0 - elif font == "all_amplicons": - if gparity == 0: - # ty = -0.1 - ty = -0.07 - else: - # ty = 0.20 - ty = 0.3 - else: - if gparity == 0: - ty = 0 - else: - ty = 0.37 - if font == "large": - ax3.plot( - [ilist.xpos(i.chrom, max(g[1].start, i.start)), ilist.xpos(i.chrom, min(g[1].end, i.end))], - [ry, ry], - "r-", - linewidth=ogene_width, - ) - ax3.text( - (ilist.xpos(i.chrom, max(g[1].start, i.start)) + ilist.xpos(i.chrom, min(g[1].end, i.end))) - / 2.0, - ty, - g[1].info["Name"], - horizontalalignment="center", - verticalalignment="bottom", - fontsize=28, - zorder=4, - ) - elif font == "all_amplicons": - ogene_width = 36 - ax3.plot( - [ilist.xpos(i.chrom, max(g[1].start, i.start)), ilist.xpos(i.chrom, min(g[1].end, i.end))], - [0.85, 0.85], - "r-", - linewidth=ogene_width, - ) - ax3.text( - (ilist.xpos(i.chrom, max(g[1].start, i.start)) + ilist.xpos(i.chrom, min(g[1].end, i.end))) - / 2.0, - -0.05 + 0.37 * gparity, - g[1].info["Name"], - horizontalalignment="center", - verticalalignment="bottom", - fontsize=48, - zorder=4, - ) - else: - ax3.plot( - [ilist.xpos(i.chrom, max(g[1].start, i.start)), ilist.xpos(i.chrom, min(g[1].end, i.end))], - [ry, ry], - "r-", - linewidth=ogene_width, - ) - ax3.text( - (ilist.xpos(i.chrom, max(g[1].start, i.start)) + ilist.xpos(i.chrom, min(g[1].end, i.end))) - / 2.0, - ty, - g[1].info["Name"], - horizontalalignment="center", - verticalalignment="bottom", - ) - gparity = (gparity + 1) % 2 - for s in segments: - if not i.intersects(s): - continue - ss = i.intersection(s) - ax3.add_patch( - Rectangle( - [ilist.xpos(i.chrom, max(ss.start, i.start)), 0.65], - ilist.xpos(i.chrom, min(ss.end, i.end)) - ilist.xpos(i.chrom, max(ss.start, i.start)), - 0.25, - fc=chrcolor[s.info[1]], - ec="k", - ) - ) - if font == "large": - ax3.text( - (ilist.xpos(i.chrom, max(ss.start, i.start)) + ilist.xpos(i.chrom, min(ss.end, i.end))) / 2.0, - 0, - s.info[0], - horizontalalignment="center", - verticalalignment="bottom", - fontsize=28, - ) - elif font == "large" or font == "all_amplicons": - ax3.text( - (ilist.xpos(i.chrom, max(ss.start, i.start)) + ilist.xpos(i.chrom, min(ss.end, i.end))) / 2.0, - 0, - s.info[0], - horizontalalignment="center", - verticalalignment="bottom", - fontsize=48, - ) - else: - ax3.text( - (ilist.xpos(i.chrom, max(ss.start, i.start)) + ilist.xpos(i.chrom, min(ss.end, i.end))) / 2.0, - 0.2 + int(s[0]) % 2 * 0.15, - s.info[0], - horizontalalignment="center", - verticalalignment="top", - ) - # ax3.text((xpos(max(s[1].start, i.start)) + xpos(min(s[1].end, i.end)))/2.0, 0.2+0%2*0.15, s[0], horizontalalignment='center', verticalalignment='top') - - if font == "large" or font == "all_amplicons": - axyticks = ax.get_yticks() - ax.set_yticks([0, axyticks[-1]]) - ax2yticks = ax2.get_yticks() - ax2.set_yticks([0, ax2yticks[-1]]) - - ax.xaxis.set_visible(False) - ax2.xaxis.set_visible(False) - ax3.yaxis.set_visible(False) - ax.spines["left"].set_visible(False) - 
ax.spines["right"].set_visible(False) - ax.spines["top"].set_visible(False) - ax2.spines["left"].set_visible(False) - ax2.spines["right"].set_visible(False) - ax2.spines["top"].set_visible(False) - ax3.spines["left"].set_visible(False) - ax3.spines["right"].set_visible(False) - ax2.spines["bottom"].set_linewidth(4) - ax3.tick_params("x", length=0, which="major") - ax3.tick_params("x", length=5, which="minor") - if font == "all_amplicons": - previous_chrom = "" - chrom_index = 1 - interval_poslist = [] - for i in ilist: - if i.chrom != previous_chrom: - chrom_index = 1 - else: - chrom_index += 1 - previous_chrom = i.chrom - imin = ilist.xpos(i.chrom, i.start) - imax = ilist.xpos(i.chrom, i.end) - segname = "" - if imax - imin > 0.2: - segname = i.chrom + "." + str(chrom_index) - elif imax - imin > 0.05: - segname = i.chrom.strip("chr") + "." + str(chrom_index) - elif imax - imin > 0.02: - segname = i.chrom.strip("chr") - interval_poslist.append((segname, (imax + imin) / 2)) - ax3.xaxis.set_major_locator(ticker.FixedLocator([c[1] for c in interval_poslist])) - ax3.xaxis.set_major_formatter(ticker.FixedFormatter([c[0] for c in interval_poslist])) - else: - chrmin = {} - chrmax = {} - for i in ilist: - if i.chrom not in chrmin: - chrmin[i.chrom] = ilist.xpos(i.chrom, i.start) - chrmax[i.chrom] = ilist.xpos(i.chrom, i.end) - else: - chrmin[i.chrom] = min(ilist.xpos(i.chrom, i.start), chrmin[i.chrom]) - chrmax[i.chrom] = max(ilist.xpos(i.chrom, i.end), chrmax[i.chrom]) - chrposlist = [] - for c in chrmin: - chrposlist.append((c if chrmax[c] - chrmin[c] > 0.10 else c.strip("chr"), (chrmin[c] + chrmax[c]) / 2)) - ax3.xaxis.set_major_locator(ticker.FixedLocator([c[1] for c in chrposlist])) - ax3.xaxis.set_major_formatter(ticker.FixedFormatter([c[0] for c in chrposlist])) - xposlist = [] - if font != "all_amplicons": - for i in ilist: - xposlist.append((str(i.start), ilist.xpos(i.chrom, i.start))) - xposlist.append((str(i.end), ilist.xpos(i.chrom, i.end))) - ax3.xaxis.set_minor_locator(ticker.FixedLocator([c[1] for c in xposlist])) - ax3.xaxis.set_minor_formatter(ticker.FixedFormatter([c[0] for c in xposlist])) - plt.setp(ax3.xaxis.get_minorticklabels(), rotation=90) - ax3.tick_params(axis="x", which="minor", pad=15) - # ax3.tick_params(axis='x', which='minor', pad=-5) - ax3.yaxis.set_major_formatter(ticker.NullFormatter()) - ax3.set_ylim(0, 1) - - # ax3.spines['bottom'].set_visible(False) - # ax3.xaxis.set_visible(False) - - fig.subplots_adjust(hspace=0) - try: - fig.savefig(amplicon_name + ".png", dpi=dpi) - fig.savefig(amplicon_name + ".pdf", dpi=dpi) - except np.linalg.linalg.LinAlgError: - logging.error("Numpy LinAlgError when forming amplicon plot! Cannot save " + amplicon_name + " image\n") - - plt.close() diff --git a/bin/breakpoint_graph.py b/bin/breakpoint_graph.py deleted file mode 100755 index 57ae5d21..00000000 --- a/bin/breakpoint_graph.py +++ /dev/null @@ -1,983 +0,0 @@ -# This software is Copyright 2017 The Regents of the University of California. All Rights Reserved. Permission to copy, modify, and distribute this software and its documentation for educational, research and non-profit purposes, without fee, and without a written agreement is hereby granted, provided that the above copyright notice, this paragraph and the following three paragraphs appear in all copies. 
Permission to make commercial use of this software may be obtained by contacting: -# -# Office of Innovation and Commercialization -# -# University of California -# -# La Jolla, CA 92093-0910 -# -# (858) 534-5815 -# -# invent@ucsd.edu -# -# This software program and documentation are copyrighted by The Regents of the University of California. The software program and documentation are supplied "as is", without any accompanying services from The Regents. The Regents does not warrant that the operation of the program will be uninterrupted or error-free. The end-user understands that the program was developed for research purposes and is advised not to rely exclusively on the program for any reason. -# -# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. -# Author: Viraj Deshpande -# Contact: virajbdeshpande@gmail.com -# Source: https://github.com/jluebeck/AmpliconArchitect -# Commit: 2172cdfd5b2834f98f60a5ee77f282249e16f527 - -import sys - -from collections import defaultdict -import heapq -import logging - -from abstract_graph import * -import ref_util as hg - -cycle_logger = logging.getLogger("cycle") - - -class breakpoint_vertex(abstract_vertex): - """Class representing breakpoint vertex derived from abstract_graph.abstract_vertex - - Attributes: - chrom = chromosome name - pos = 1-based chromosomal location - strand = 1/-1 for forward/reverse strand - vid = (optional)id of vertex - graph = (optional) graph to which vertex belongs""" - - def __init__(self, chrom="", pos=-2, strand=1, vid=0, graph=None): - """2 ways to initialize: - 1) chrom: breakpoint_vertex string in the format chrom:pos("+"/"-"") - 2) chrom, pos, strand: name(STR), pos (INT), strand("+"/"-"")""" - if pos == -2: - vstring = chrom - chrom = vstring[: vstring.find(":")] - pos = int(vstring[vstring.find(":") + 1 : -1]) - strand = 1 if vstring[-1] == "+" else -1 - if graph is not None and graph.has_vertex(chrom, pos, strand): - raise Exception("Duplicate vertex added") - abstract_vertex.__init__(self, vid, graph) - self.chrom = chrom - self.pos = pos - self.strand = strand - - def __repr__(self): - """String format chrom:pos(+/-)""" - if self.strand == 1: - return self.chrom + ":" + str(self.pos) + "+" - else: - return self.chrom + ":" + str(self.pos) + "-" - - def __hash__(self): - return str(self).__hash__() - - def __gt__(self, y): - """Order vertices by absolute position (See hg19util.absPos) + strand""" - return hg.absPos(self.chrom, self.pos) + 0.4 * self.strand > hg.absPos(y.chrom, y.pos) + 0.4 * y.strand - - -class breakpoint_edge(abstract_edge): - """Class representing breakpoint edge derived from abstract_graph.abstract_edge - - Attributes: - v1 = breakpoint_vertex 1 of the edge (recommend using v2 > v1) - v2 = breakpoint_vertex 2 of the edge - edge_type = "discordant"/"breakpoint" or "concordant" : genomic connectivity or source; "sequence": genomic interval - eid = (optional) edge id - graph = 
(optional) graph to which edge belongs""" - - def __init__( - self, v1, v2=None, eid=0, graph=None, update_vertices=True, edge_type="discordant", hom=None, hom_seq=None - ): - """2 ways to initialize: - 1) v1 = breakpoint_edge string in the format breakpoint_vertex1->breakpoint_vertex2 - 2) v1,v2 = breakpoint_point_vertices - Optional: - eid: edge id - graph: breakpoint_graph - update_vertices: True if vertices edge should be added to vertex neighbor list - edge_type: " = "discordant"/"breakpoint" or "concordant" : genomic connectivity or source; "sequence": genomic interval - Required: - If edge_type = "sequence": v1.chrom = v2.chrom, v1.pos > v2.pos else if equal v1.strand > v2.strand - If edge_type = "concordant": v1.chrom = v2.chrom, |v1.pos - v2.pos| = 1 and the smaller has strand = 1 else -1 - """ - if type(v1) == str: - estr = v1 - v1 = breakpoint_vertex(estr.split(">")[0][:-1]) - v2 = breakpoint_vertex(estr.split(">")[1]) - abstract_edge.__init__(self, v1, v2, eid, graph, update_vertices) - if edge_type in ["concordant", "sequence"]: - if v1.chrom != v2.chrom: - raise Exception("Edge of type " + edge_type + " connects different chromosomes.") - if edge_type in ["concordant", "sequence"]: - if v1.strand == v2.strand: - raise Exception("Edge of type " + edge_type + " connects same strand.") - if edge_type == "concordant": - if (v1.strand == 1 and v1.pos + 1 != v2.pos) or (v2.strand == 1 and v2.pos + 1 != v1.pos): - raise Exception("Edge of type " + edge_type + " connects non-adjacent positions.") - if edge_type == "sequence": - if v1.strand == -1 and v1.pos > v2.pos: - raise Exception( - "Start position for sequence edge greater than end position:" + str(v1) + "->" + str(v2) - ) - if v1.strand == 1 and v2.pos > v1.pos: - raise Exception("Start position for sequence edge greater than end position") - self.edge_type = edge_type - self.hom = hom - self.hom_seq = hom_seq - - def sequence(self, flank_size=-1): - if self.edge_type == "sequence": - seq = hg.interval(self.v1.chrom, self.v1.pos, self.v2.pos).sequence() - if flank_size > 0: - seq = ( - hg.interval(self.v1.chrom, self.v1.pos - flank_size + 1, self.v1.pos).sequence() - + seq - + hg.interval(self.v2.chrom, self.v2.pos, self.v2.pos + flank_size - 1).sequence() - ) - else: - if self.hom == None: - seq = "N" * 20 - else: - seq = self.hom_seq - if flank_size == -1: - flank_size = 1000 - if flank_size > 0: - if self.hom is not None and self.hom > 0: - hom = self.hom - else: - hom = 0 - if self.edge_type == "source": - if self.v2.strand == -1: - right_seq = hg.interval( - self.v2.chrom, self.v2.pos + hom, self.v2.pos + hom + flank_size - 1 - ).sequence() - left_seq = "" - else: - left_seq = hg.interval( - self.v2.chrom, self.v2.pos - hom - flank_size + 1, self.v2.pos - hom - ).sequence() - right_seq = "" - elif self.v1.strand == 1: - left_seq = hg.interval( - self.v1.chrom, self.v1.pos - hom - flank_size + 1, self.v1.pos - hom - ).sequence() - if self.v2.strand == -1: - right_seq = hg.interval( - self.v2.chrom, self.v2.pos + hom, self.v2.pos + hom + flank_size - 1 - ).sequence() - else: - right_seq = hg.interval( - self.v2.chrom, self.v2.pos - hom - flank_size + 1, self.v2.pos - hom, strand=-1 - ).sequence() - else: - right_seq = hg.interval( - self.v1.chrom, self.v1.pos + hom, self.v1.pos + hom + flank_size - 1 - ).sequence() - if self.v2.strand == -1: - left_seq = hg.interval( - self.v2.chrom, self.v2.pos + hom, self.v2.pos + hom + flank_size - 1, strand=-1 - ).sequence() - else: - left_seq = hg.interval( - self.v2.chrom, self.v2.pos 
- hom - flank_size + 1, self.v2.pos - hom - ).sequence() - seq = left_seq + seq + right_seq - return seq - - def kmer_homology(self, k=10, span=100): - """Number of shared k-mers within "span" distance on either side of vertex positions""" - seq1 = "".join( - [ - a.capitalize() - for a in hg.interval( - self.v1.chrom, - max(1, self.v1.pos - span), - min(self.v1.pos + span, hg.chrLen[hg.chrNum(self.v1.chrom)]), - self.v1.strand, - ).sequence() - ] - ) - seq2 = "".join( - [ - a.capitalize() - for a in hg.interval( - self.v2.chrom, - max(1, self.v2.pos - span), - min(self.v2.pos + span, hg.chrLen[hg.chrNum(self.v2.chrom)]), - -1 * self.v2.strand, - ).sequence() - ] - ) - kset1 = set([seq1[i : i + 10] for i in range(len(seq1) - k + 1)]) - kset2 = set([seq2[i : i + 10] for i in range(len(seq2) - k + 1)]) - return len(kset1.intersection(kset2)) - - def type(self, min_insert=0, max_insert=500): - """Determine type of "breakpoint"/"discordant edge - Output values: - "source": Contains v.pos = -1, indicates end of linear contig. - "interchromosomal": Different chromosomes. - "everted": Forward strand of larger position connected to reverse strand of reverse, indicated by outward orientation of read-pairs, may suggest tandem duplication. - "forward": Both vertex/paired-reads map to forward strand - "reverse": Both vertex/paired-reads map to reverse strand - "discordant": Alignment distance larger/smaller than max/min insert, may indicate deletion - "concordant": Expected alignment length between min and max insert. NOTE: Different from edge_type - """ - if self.v1.pos == -1 or self.v2.pos == -1: - return "source" - elif self.v1.chrom != self.v2.chrom: - return "interchromosomal" - elif self.v1.pos <= self.v2.pos: - vmin = self.v1 - vmax = self.v2 - else: - vmin = self.v2 - vmax = self.v1 - if vmax.strand == 1 and vmin.strand == -1: - return "everted" - if vmax.pos == vmin.pos and vmax.strand != vmin.strand: - return "everted" - if vmax.strand == 1 and vmin.strand == 1: - return "forward" - if vmax.strand == -1 and vmin.strand == -1: - return "reverse" - if vmax.pos - vmin.pos > max_insert or vmax.pos - vmin.pos < min_insert: - return "discordant" - return "concordant" - - def __repr__(self): - """breakpoint_vertex1->breakpoint_vertex2""" - return str(self.v1) + "->" + str(self.v2) - - def __lt__(self, other): - return min((self.v1.chrom, self.v1.pos), (self.v2.chrom, self.v2.pos)) < min( - (other.v1.chrom, self.v1.pos), (other.v2.chrom, self.v2.pos) - ) - - -class breakpoint_graph(abstract_graph): - """Class representing breakpoint edge derived from abstract_graph.abstract_graph""" - - def __init__(self, graphfile=None): - """Creates an empty graph if no graphfile provided - Loads graph from graph file in format defined in load_graphfile""" - abstract_graph.__init__(self) - self.vhash = {} - if graphfile is not None: - self.load_graphfile(graphfile) - - def has_vertex(self, chrom, pos, strand): - vtemp = breakpoint_vertex(chrom, pos, strand) - if vtemp.__hash__() in self.vhash: - return self.vhash[vtemp.__hash__()] - else: - return None - - def new_vertex(self, chrom, pos, strand): - """Create, add and return new breakpoint_vertex if similar vertex not already present""" - v = self.has_vertex(chrom, pos, strand) - if v is not None: - return v - v = breakpoint_vertex(chrom, pos, strand, graph=self) - self.vhash[v.__hash__()] = v - return v - - def new_edge(self, v1, v2, edge_type="discordant", hom=None, hom_seq=None): - """Create, add and return breakpoint_edge to current graph. 
Recommend using "add_edge()". "new_edge()" may incorrectly add duplicate edges - Arguments: - v1,v2: breakpoint_vertex (These need to be vertices(objects) from current breakpoint graph) - edge_type = "breakpoint"/"discordant"/"concordant"/"source"/"sequence" """ - return breakpoint_edge(v1, v2, graph=self, edge_type=edge_type, hom=hom, hom_seq=hom_seq) - - def add_vertex(self, v): - """Create and add new vertex to graph if no similar vertex exists""" - return self.new_vertex(v.chrom, v.pos, v.strand) - - def add_edge(self, e, edge_type="discordant"): - """Add and return edge similar e to the graph. If e(object) already belongs to graph, return e. - Checks if corresponding vertices already present else return None. - If edge_type not defined, then inherits e.edge_type. - """ - if e.edge_type is not None: - edge_type = e.edge_type - if e.graph is not self: - v1 = self.has_vertex(e.v1.chrom, e.v1.pos, e.v1.strand) - v2 = self.has_vertex(e.v2.chrom, e.v2.pos, e.v2.strand) - if v1 is None or v2 is None: - return None - return self.new_edge(v1, v2, edge_type=edge_type, hom=e.hom, hom_seq=e.hom_seq) - return e - - def load_graphfile(self, graphfile): - """Load breakpoint_graph from file - Format: edge_type edge_string edge_copycount - """ - graphfile_handle = open(graphfile) - ll = [l.strip().split() for l in graphfile_handle] - graphfile_handle.close() - self.copy_count = defaultdict(lambda: 0, {}) - for l in ll: - if len(l) == 0: - continue - if l[0] == "sequence": - v1 = self.add_vertex(breakpoint_vertex(l[1])) - v2 = self.add_vertex(breakpoint_vertex(l[2])) - e = self.new_edge(v1, v2, edge_type="sequence") - self.copy_count[e] = float(l[3]) - if l[0] == "concordant": - e = self.add_edge(breakpoint_edge(l[1], edge_type=l[0])) - self.copy_count[e] = float(l[2]) - if l[0] == "source" or l[0] == "discordant" or l[0] == "breakpoint": - e = self.add_edge(breakpoint_edge(l[1], edge_type="discordant")) - self.copy_count[e] = float(l[2]) - return - - def djikstra_distance(self, v1, v2, min_count=0): - """Find shortest genomic path and distance between genomic locations (including strand) in the breakpoint graph with copy count = min_count. 
- Return none if not found - Return format: - (distance, path, traversal_copy_count) - distance: INT describing number of base-pairs in intermediate region - path: list of alternating (sequence edge, strand(1/-1)) and (breakpoint edge, strand(1,-1)) such that sequence edges in first/last entries contain v1/v2 - """ - for e in self.es.values(): - if e.v1.chrom == v1.chrom and e.v1.pos <= v1.pos and v1.pos <= e.v2.pos: - e1 = e - if e.v1.chrom == v2.chrom and e.v1.pos <= v2.pos and v2.pos <= e.v2.pos: - e2 = e - if self.copy_count[e1] < min_count or self.copy_count[e2] < min_count: - return None - if v1.strand == v2.strand and e1 == e2 and (v2.pos - v1.pos) * v1.strand > 0: - return (abs(v1.pos - v2.pos - 1), [(e1, v1.strand)], self.copy_count[e1]) - if v1.strand == 1: - distance = e1.v2.pos - v1.pos - else: - distance = v1.pos - e1.v1.pos - a = [(distance, [(e1, v1.strand)], self.copy_count[e1])] - heapq.heapify(a) - while len(a) > 0: - d, path, cc = heapq.heappop(a) - e, s = path[-1] - if s == 1: - e_new = e.v2.elist - v = e.v2 - else: - e_new = e.v1.elist - v = e.v1 - e_new = [e_next for e_next in e_new if e_next.edge_type != "sequence"] - e_search = [] - for en in e_new: - min_c = min(cc, self.copy_count[en]) - if min_c < min_count: - continue - if v == en.v1: - en_strand = 1 - v_seq = en.v2 - else: - en_strand = -1 - v_seq = en.v1 - if (en, en_strand) in path: - continue - if (en, -1 * en_strand) in path: - min_c = min(min_c, self.copy_count[en] / 2.0) - if min_c < min_count: - continue - en_seq, en_seqstrand = [ - (es, 1 if v_seq == es.v1 else -1) for es in v_seq.elist if es.edge_type == "sequence" - ][0] - min_c = min(min_c, self.copy_count[en_seq]) - if min_c < min_count: - continue - if (en_seq, en_seqstrand) in path and not (en_seq == e1 and e1 == e2 and en_seqstrand == v1.strand): - continue - if (en_seq, -1 * en_seqstrand) in path: - min_c = min(self.copy_count[en_seq] / 2.0, min_c) - if min_c < min_count: - continue - if en_seq == e2 and v2.strand == en_seqstrand: - if v2.strand == 1: - dd = d + v2.pos - e2.v1.pos - else: - dd = d + e2.v2.pos - v2.pos - return (dd, path + [(en, en_strand), (en_seq, en_seqstrand)], min_c) - heapq.heappush( - a, (d + en_seq.v2.pos - en_seq.v1.pos + 1, path + [(en, en_strand), (en_seq, en_seqstrand)], min_c) - ) - return None - - def cycle_decomposition(self, w, s): - """ - Decompose breakpoint_graph into 'simple' cycles. - Simple cycles may contain a sequence edge atmost once along each strand. - Reports maximum parsimonious cycles starting from thickest cycle until 80% of genomic content is covered. 
- w is dict containing weights (counts) of edges - s is source vertex, this vertex has the exception of not having a sequence edge attached""" - - def thickest_cycle(hce, wehc): - # print hce, wehc - v1 = hce[1].v1 - a = [(-1 * hce[0], v1)] - heapq.heapify(a) - hdict = {v1: (hce[0], [hce[1]], None, set())} - seenSet = set() - seenEdges = set() - completed = False - while len(a) > 0 and not completed: - # print len(a), str(a[0]), str(hdict[a[0][1]]) - v1w, v1 = heapq.heappop(a) - if v1 == hce[1].v1 and v1 in seenSet: - completed = True - break - for e in v1.elist: - if e.edge_type == "sequence": - continue - else: - v2 = e.neighbor(v1) - if v2 == s: - v3 = v2 - if e in hdict[v1][3]: - nw = min(hdict[v1][0], wehc[e] / 2) - else: - nw = min(hdict[v1][0], wehc[e]) - if not v3 in hdict or hdict[v3][2] is None or hdict[v3][0] < nw: - nhdict = hdict[v1][3].copy() - nhdict.add(e) - hdict[v3] = (nw, [e], v1, nhdict) - seenEdges.add(e) - else: - for e2 in v2.elist: - if e2.edge_type == "sequence": - se = e2 - v3 = e2.neighbor(v2) - break - if e in hdict[v1][3]: - # print 'e is seen', e, seenEdges - nw = min(hdict[v1][0], wehc[e] / 2) - elif se in hdict[v1][3]: - # print 'se is seen', se, seenEdges - nw = min(hdict[v1][0], wehc[e], wehc[se] / 2) - else: - nw = min(hdict[v1][0], wehc[e]) - if not v3 in hdict or hdict[v3][2] is None or hdict[v3][0] < nw: - nhdict = hdict[v1][3].copy() - nhdict.add(e) - nhdict.add(se) - hdict[v3] = (nw, [e, se], v1, nhdict) - # print 'seen edges', e, se, v3, hdict[v3] - seenEdges.add(e) - seenEdges.add(se) - if v3 in seenSet: - continue - seenSet.add(v3) - heapq.heappush(a, (-1 * hdict[v3][0], v3)) - if len(a) == 0 and not completed: - print("NOT COMPLETED", hce[1].v1) - s2Set = set() - tc = hdict[hce[1].v1][1] - v2 = hdict[hce[1].v1][2] - while v2 != hce[1].v1: # and not v2 in s2Set: - # print hce[1].v1, v2, s2Set - s2Set.add(v2) - if v2 not in hdict: - print(str(v2), str(hce[1].v1), str(tc)) - for ee in hce[1].v1.elist: - print(str(ee), wehc[ee]) - tc = hdict[v2][1] + tc - v2 = hdict[v2][2] - s2Set.add(v2) - # print v2, tc - return tc, hdict[hce[1].v1][0] - - total_amplicon_content = sum([(e.v2.pos - e.v1.pos) * w[e] for e in w if e.edge_type == "sequence"]) - amplicon_content_covered = 0 - w2 = w.copy() - cycle_number = 1 - cycle_list = [] - while max(w2.values()) > 0.1: - we = [(w2[e], e) for e in w2] - we.sort() - wer = we[::-1] - we = wer - wei = 0 - tcwmax = -1 - tcmax = None - tchwmax = -1 - tchmax = None - tchw = -1 - - # print "EEEEEEEEEEEEEE", len(w2) - # for e in w2: - # print "EEEEEEEEEEEE", str(e), e.edge_type, w2[e] - # print "EEEEEEEEE========================" - while wei < len(we): # and (tcwmax == -1 or we[wei][0] >= tcwmax / 2.0): - # if we[wei][1].edge_type == 'sequence': - # wei += 1 - # continue - if w2[we[wei][1]] < 0.1: - wei += 1 - continue - tc, tcw = thickest_cycle(we[wei], w2) - if len(tc) < 2: - print(str(tc[0])) - exit() - if tcw > tcwmax: - tcmax = tc - tcwmax = tcw - # sumlen = sum([abs(e.v1.pos - e.v2.pos) for e in tc if e.edge_type == 'sequence']) - # if sumlen * tcw > tchwmax: - # tchwmax = sumlen * tcw - # tchmax = tc - # tchw = tcw - wei += 1 - if tcwmax == -1: - break - tc = tcmax - tcw = tcwmax - # tc = tchmax - # tcw = tchw - if -1 in [e.v1.pos for e in tc] + [e.v2.pos for e in tc]: - csource = 0 - for ci in range(len(tc) - 1): - if -1 in [tc[ci].v1.pos, tc[ci].v2.pos] and -1 in [tc[ci + 1].v1.pos, tc[ci + 1].v2.pos]: - csource = ci + 1 - tc = tc[ci + 1 :] + tc[0 : ci + 1] - break - if tc[0].v1 == tc[1].v1 or tc[0].v1 == tc[1].v2: 
- v2 = tc[0].v1 - v1 = tc[0].v2 - else: - v2 = tc[0].v2 - v1 = tc[0].v1 - for ci in range(len(tc)): - if tc[ci].v1.pos == v1.pos: - v2 = tc[ci].v2 - else: - v2 = tc[ci].v1 - if tc[ci].edge_type == "sequence": - if v1.pos > v2.pos: - tc = tc[::-1] - break - v1 = v2 - else: - if tc[0].v1 == tc[1].v1 or tc[0].v1 == tc[1].v2: - v2 = tc[0].v1 - v1 = tc[0].v2 - else: - v2 = tc[0].v2 - v1 = tc[0].v1 - for ci in range(len(tc)): - if tc[ci].v1.pos == v1.pos: - v2 = tc[ci].v2 - else: - v2 = tc[ci].v1 - if tc[ci].edge_type == "sequence": - if v1.pos > v2.pos: - tc = tc[ci::-1] + tc[:ci:-1] - break - v1 = v2 - ci = 0 - while tc[ci].type() == "concordant" or tc[ci - 1].type() == "concordant": - ci -= 1 - tc = tc[ci:] + tc[:ci] - - if tcw == 0: - print("tcw is 0") - break - print("Cycle ", cycle_number, ": Copy count = ", tcw, tc) - cycle_edge_list = [] - ci = 1 - v0 = None - v0c = None - if tc[0].v1 == tc[1].v1 or tc[0].v1 == tc[1].v2: - v2 = tc[0].v1 - v1 = tc[0].v2 - else: - v2 = tc[0].v2 - v1 = tc[0].v1 - if tc[0].edge_type == "sequence": - v0 = v1 - v0c = v2 - elif v1.pos == -1 or v2.pos == -1: - print(v1, "->", v2) - cycle_edge_list.append((v1, v2)) - v1 = v2 - while ci < len(tc): - if (tc[ci].v1.chrom, tc[ci].v1.pos, tc[ci].v1.strand) == (v1.chrom, v1.pos, v1.strand): - v2 = tc[ci].v2 - else: - v2 = tc[ci].v1 - if v1.pos == -1 or v2.pos == -1: - if v0 is not None: - print(v0, "->", v0c) - cycle_edge_list.append((v0, v0c)) - print(v1, "->", v2) - cycle_edge_list.append((v1, v2)) - v0 = None - v0c = None - elif tc[ci].edge_type == "sequence": - if v0 is None: - v0 = v1 - v0c = v2 - else: - v0c = v2 - elif tc[ci].type() != "concordant": - if v0 is not None: - print(v0, "->", v0c) - cycle_edge_list.append((v0, v0c)) - v0 = None - v0c = None - v1 = v2 - ci += 1 - if v0 is not None: - print(v0, "->", v0c) - cycle_edge_list.append((v0, v0c)) - if amplicon_content_covered <= 0.9 * total_amplicon_content or (tcw > 0.2 * cycle_list[0][1]): - cycle_list.append([cycle_number, tcw, tc, cycle_edge_list]) - acc = tcw * sum([abs(e[1].pos - e[0].pos) for e in cycle_edge_list if -1 not in [e[0].pos, e[1].pos]]) - amplicon_content_covered += acc - cycle_number += 1 - # print tcw, tc - for e in tc: - w2[e] = w2[e] - tcw - # if w2[e] == 0.0: - # w2.pop(e) - if amplicon_content_covered > total_amplicon_content: - break - - segment_list = [] - for c in cycle_list: - max_segment = c[3][0] - max_orientation = "+" - max_segi = 0 - segi = 0 - for e in c[3]: - if (-1 in (max_segment[0].pos, max_segment[1].pos) and -1 not in (e[0].pos, e[1].pos)) or ( - abs(e[0].pos - e[1].pos) >= abs(max_segment[0].pos - max_segment[1].pos) - ): - max_segment = e - max_segi = segi - if e[0].pos + 0.4 * e[0].strand <= e[1].pos + 0.4 * e[1].strand: - max_orientation = "+" - else: - max_orientation = "-" - if e[0].pos + 0.4 * e[0].strand <= e[1].pos + 0.4 * e[1].strand: - if e not in segment_list: - segment_list.append(e) - else: - if (e[1], e[0]) not in segment_list: - segment_list.append((e[1], e[0])) - segi += 1 - if max_orientation == "+": - c[3] = c[3][max_segi:] + c[3][:max_segi] - else: - c[3] = [(e[1], e[0]) for e in c[3][: max_segi + 1][::-1] + c[3][max_segi + 1 :][::-1]] - - segment_list.sort() - segi = 1 - segment_index = {} - for s in [ss for ss in segment_list if ss[0].pos != -1 and ss[1].pos != -1]: - segment_index[s] = segi - segi += 1 - cycle_logger.info("List of cycle segments") - for s in [ss for ss in segment_list if ss[0].pos == -1 or ss[1].pos == -1]: - segment_index[s] = 0 - for s in [ss for ss in segment_list if 
ss[0].pos != -1 and ss[1].pos != -1]: - cycle_logger.info( - "Segment\t" + "\t".join([str(segment_index[s]), s[0].chrom, str(s[0].pos), str(s[1].pos)]) - ) - for c in cycle_list: - seglist = [] - orientation_list = [] - for e in c[3]: - if e in segment_index: - seglist.append(segment_index[e]) - orientation_list.append("+") - else: - seglist.append(segment_index[(e[1], e[0])]) - orientation_list.append("-") - cycle_logger.info( - "Cycle=" - + str(c[0]) - + ";Copy_count=" - + str(c[1]) - + ";Segments=" - + ",".join([str(e[0]) + str(e[1]) for e in zip(seglist, orientation_list)]) - ) - - return None - - def __repr__(self): - return "/n".join(map(str, self.vs.values() + self.es.values())) + "\n" - - -class graph_decomposition(object): - """Class represents decomposition of a breakpoint_graph with balanced edge counts into cycles/walks - Provides methods to merge and modify cycles into larger walks to represent architecture of complex rearrangements. - """ - - def __init__(self, segment_list=None, cycle_list=None, ilist=None, file=None, file_content=None): - if file is not None or file_content is not None: - self.segment_list = hg.interval_list([]) - self.segment_dict = {} - self.cycle_dict = {} - self.ilist = hg.interval_list([]) - - if file_content: - lines = file_content.split("\n") - else: - lines = str(open(file).read().decode()).split("\n") - ll = [l.strip().split() for l in lines if len(l.strip()) > 0] - for l in ll: - if "Segment" == l[0]: - s = hg.interval(l[2], int(l[3]), int(l[4]), info=[l[1]]) - self.segment_dict[l[1]] = s - self.segment_list.append(s) - elif "Cycle=" in l[0]: - ls = l[0].split(";") - ci = ls[0].split("=")[1] - cn = float(ls[1].split("=")[1]) - cl = [] - for s in ls[2].split("=")[1].split(","): - if s[-1] == "+": - cl.append((s[:-1], 1)) - else: - cl.append((s[:-1], -1)) - self.cycle_dict[ci] = (ci, cn, cl) - elif "Interval" == l[0]: - self.ilist.append(hg.interval(l[2], int(l[3]), int(l[4]), info=[l[1]])) - elif cycle_list is None: - segment_set = hg.interval_list( - [hg.interval(ss[0], ss[1], ss[2]) for ss in {(s.chrom, s.start, s.end) for s in segment_list}] - ) - segment_set.sort() - self.segment_list = segment_set - self.segment_dict = {} - seg_id = {} - cl = [] - for s in enumerate(segment_set): - self.segment_dict[str(s[0] + 1)] = s[1] - seg_id[(s[1].chrom, s[1].start, s[1].end)] = str(s[0] + 1) - for s in segment_list: - cl.append((seg_id[(s.chrom, s.start, s.end)], s.strand)) - for ii in range(len(self.segment_list)): - s = self.segment_list[ii] - s.info = [seg_id[(s.chrom, s.start, s.end)]] - self.cycle_dict = {"1": ("1", 1, cl)} - self.ilist = hg.interval_list([s[0] for s in segment_set.merge_clusters(extend=1)]) - for ii in range(len(self.ilist)): - self.ilist[ii].info = [str(ii)] - else: - self.segment_list = segment_list - self.segment_dict = {s.info[0]: s for s in segment_list} - self.cycle_dict = {c[0]: c for c in cycle_list} - if ilist is not None: - self.ilist = ilist - else: - self.ilist = hg.interval_list([s[0] for s in segment_list.merge_clusters(extend=1)]) - for ii in range(len(self.ilist)): - self.ilist[ii].info = [str(ii)] - - def next_seg_id(self): - mi = 0 - for i in self.segment_dict: - if int(i) > mi: - mi = int(i) - return str(mi + 1) - - def next_cycle_id(self): - mi = 1 - while str(mi) in self.cycle_dict: - mi += 1 - return str(mi) - - def merge(self, c1, c2, si1, si2): - cycle1 = self.cycle_dict[c1] - cycle2 = self.cycle_dict[c2] - # check if atmost 1 cycle has source vertex - if "0" in [s[0] for s in cycle1[2]] and "0" in [s[0] for 
s in cycle2[2]]: - raise Exception("Cannot merge 2 cycles with source vertices") - # if cycle2 has source vertex, exchange c1,c2 - if "0" in [s[0] for s in cycle2[2]]: - (c1, c2, si1, si2, cycle1, cycle2) = (c2, c1, si2, si1, cycle2, cycle1) - if si1 == 0 or si1 == len(cycle1[2]) - 1: - raise Exception("Cannot use source segment for merging") - # check if segments overlap - if not self.segment_dict[cycle1[2][si1][0]].intersects(self.segment_dict[cycle2[2][si2][0]]): - raise Exception( - "Segments do not overlap" - + str(self.segment_dict[cycle1[2][si1][0]]) - + " " - + str(self.segment_dict[cycle2[2][si2][0]]) - ) - # cnlist: (merged cn, cycle1cn, cycle2cn) - if cycle1[1] == 0 or cycle2[1] == 0: - raise Exception("Cycle copy numbers should be > 0 to merge") - if cycle1[1] > cycle2[1]: - cnlist = (cycle2[1], cycle1[1] - cycle2[1], 0.0) - else: - cnlist = (cycle1[1], 0.0, cycle2[1] - cycle1[1]) - seg1 = self.segment_dict[cycle1[2][si1][0]] - seg2 = self.segment_dict[cycle2[2][si2][0]] - seg1_found = False - seg2_found = False - for i in self.segment_list: - if cycle1[2][si1][1] == 1 and (i.chrom, i.start, i.end) == (seg1.chrom, seg1.start, seg2.end): - seg1_found = True - ns1 = i.info[0] - overlap1 = (ns1, cycle1[2][si1][1]) - elif cycle1[2][si1][1] == -1 and (i.chrom, i.start, i.end) == (seg1.chrom, seg2.start, seg1.end): - seg1_found = True - ns1 = i.info[0] - overlap1 = (ns1, cycle1[2][si1][1]) - if cycle1[2][si1][1] == 1 and (i.chrom, i.start, i.end) == (seg1.chrom, seg2.start, seg1.end): - seg2_found = True - ns2 = i.info[0] - overlap2 = (ns2, cycle1[2][si1][1]) - elif cycle1[2][si1][1] == -1 and (i.chrom, i.start, i.end) == (seg1.chrom, seg1.start, seg2.end): - seg2_found = True - ns2 = i.info[0] - overlap2 = (ns2, cycle1[2][si1][1]) - if not seg1_found: - ns1 = self.next_seg_id() - overlap1 = (ns1, cycle1[2][si1][1]) - if cycle1[2][si1][1] == 1: - self.segment_dict[ns1] = hg.interval(seg1.chrom, seg1.start, seg2.end, info=[ns1]) - else: - self.segment_dict[ns1] = hg.interval(seg1.chrom, seg2.start, seg1.end, info=[ns1]) - self.segment_list.append(self.segment_dict[ns1]) - if not seg2_found: - ns2 = self.next_seg_id() - overlap2 = (ns2, cycle1[2][si1][1]) - if cycle1[2][si1][1] == 1: - self.segment_dict[ns2] = hg.interval(seg1.chrom, seg2.start, seg1.end, info=[ns2]) - else: - self.segment_dict[ns2] = hg.interval(seg1.chrom, seg1.start, seg2.end, info=[ns2]) - self.segment_list.append(self.segment_dict[ns2]) - cycle1_init = cycle1[2][:si1] - if not cycle1[2][si1][1]: - (overlap1, overlap2, ns1, ns2) = (overlap2, overlap1, ns2, ns1) - if cycle1[2][si1][1] == cycle2[2][si2][1]: - cycle2_span = cycle2[2][si2 + 1 :] + cycle2[2][:si2] - else: - cycle2_span = [(s[0], -1 * s[1]) for s in cycle2[2][:si2][::-1] + cycle2[2][si2 + 1 :][::-1]] - cycle1_final = cycle1[2][si1 + 1 :] - mcycle = cycle1_init + [overlap1] + cycle2_span + [overlap2] + cycle1_final - mcycle_id = self.next_cycle_id() - self.cycle_dict[mcycle_id] = (mcycle_id, cnlist[0], mcycle) - self.cycle_dict[c1] = (c1, cnlist[1], cycle1[2]) - self.cycle_dict[c2] = (c2, cnlist[2], cycle2[2]) - return - - def pivot(self, c1, si1, si2): - cycle1 = self.cycle_dict[c1] - # check if segments overlap - if not self.segment_dict[cycle1[2][si1][0]].intersects(self.segment_dict[cycle1[2][si2][0]]): - raise Exception("Segments do not overlap") - # check if segments have opposite orientation - if cycle1[2][si1][1] == cycle1[2][si2][1]: - raise Exception("Segments should be in opposite orientation") - seg1 = self.segment_dict[cycle1[2][si1][0]] - 
seg2 = self.segment_dict[cycle1[2][si2][0]] - seg1_found = False - seg2_found = False - for i in self.segment_list: - if (i.chrom, i.start, i.end) == (seg1.chrom, seg1.start, seg2.end): - seg1_found = True - ns1 = i.info[0] - overlap1 = (ns1, cycle1[2][si1][1]) - if (i.chrom, i.start, i.end) == (seg1.chrom, seg2.start, seg1.end): - seg2_found = True - ns2 = i.info[0] - overlap2 = (ns2, cycle1[2][si2][1]) - if not seg1_found: - ns1 = self.next_seg_id() - overlap1 = (ns1, cycle1[2][si1][1]) - self.segment_dict[ns1] = hg.interval(seg1.chrom, seg1.start, seg2.end, info=[ns1]) - self.segment_list.append(self.segment_dict[ns1]) - if not seg2_found: - ns2 = self.next_seg_id() - overlap2 = (ns2, cycle1[2][si2][1]) - self.segment_dict[ns2] = hg.interval(seg1.chrom, seg2.start, seg1.end, info=[ns2]) - self.segment_list.append(self.segment_dict[ns2]) - cycle1_init = cycle1[2][:si1] - if cycle1[2][si1][1] == -1: - (overlap1, overlap2, ns1, ns2) = ( - (overlap2[0], -1 * overlap2[1]), - (overlap1[0], -1 * overlap1[1]), - ns2, - ns1, - ) - cycle1_span = [(s[0], -1 * s[1]) for s in cycle1[2][si1 + 1 : si2][::-1]] - cycle1_final = cycle1[2][si2 + 1 :] - mcycle = cycle1_init + [overlap1] + cycle1_span + [overlap2] + cycle1_final - mcycle_id = self.next_cycle_id() - self.cycle_dict[mcycle_id] = (mcycle_id, cycle1[1], mcycle) - self.cycle_dict[c1] = (c1, 0.0, cycle1[2]) - return - - def fasta_sequence(self, cycle_list=None, outfasta=None): - if cycle_list is None: - ccnlist = [(c[1], c[0]) for c in self.cycle_dict.values()] - ccnlist.sort(reverse=True) - print(ccnlist) - cycle_list = [c[1] for c in ccnlist] - fseq = "" - if outfasta is not None: - outfile = open(outfasta, "w") - for c in cycle_list: - if outfasta is None: - fseq += ( - ">Cycle" - + c - + " Copy_count=" - + str(self.cycle_dict[c][1]) - + ";Segments=" - + ",".join([seg[0] + ("+" if seg[1] == 1 else "-") for seg in self.cycle_dict[c][2]]) - + "\n" - ) - else: - outfile.write( - ">Cycle" - + c - + " Copy_count=" - + str(self.cycle_dict[c][1]) - + ";Segments=" - + ",".join([seg[0] + ("+" if seg[1] == 1 else "-") for seg in self.cycle_dict[c][2]]) - + "\n" - ) - for s in self.cycle_dict[c][2]: - if s[0] == "0": - continue - if s[1] == 1: - if outfasta is None: - fseq += self.segment_dict[s[0]].sequence(new_fa_file=self.fa_file) - else: - outfile.write(self.segment_dict[s[0]].sequence(new_fa_file=self.fa_file)) - else: - if outfasta is None: - fseq += hg.reverse_complement(self.segment_dict[s[0]].sequence(new_fa_file=self.fa_file)) - else: - outfile.write(hg.reverse_complement(self.segment_dict[s[0]].sequence(new_fa_file=self.fa_file))) - if outfasta is None: - fseq += "\n" - else: - outfile.write("\n") - if outfasta is not None: - outfile.close() - return fseq - - def __repr__(self): - s = "" - for i in self.ilist: - s += "\t".join(["Interval", i.info[0], i.chrom, str(i.start), str(i.end)]) + "\n" - for i in self.segment_list: - s += "\t".join(["Segment", i.info[0], i.chrom, str(i.start), str(i.end)]) + "\n" - ccnlist = [(c[1], c[0]) for c in self.cycle_dict.values()] - ccnlist.sort(reverse=True) - for c in ccnlist: - s += ( - "Cycle=" - + c[1] - + ";Copy_count=" - + str(c[0]) - + ";Segments=" - + ",".join([seg[0] + ("+" if seg[1] == 1 else "-") for seg in self.cycle_dict[c[1]][2]]) - + "\n" - ) - return s diff --git a/bin/check_reference.py b/bin/check_reference.py deleted file mode 100755 index 0f047b69..00000000 --- a/bin/check_reference.py +++ /dev/null @@ -1,144 +0,0 @@ -#!/usr/bin/env python - -# Author: Jens Luebeck -# Contact: jluebeck [at] 
ucsd.edu -# License: BSD 2-Clause License -# Source: https://github.com/AmpliconSuite/AmpliconSuite-pipeline -# Commit: 0a8a2ff2324b15aab7cb88d310dcc458d06c0bed - -from collections import defaultdict -import logging -import subprocess -import sys - -# create the set of autosomal chromosome names in various builds. -# should be updated if a reference is added to the data repo with more than 22 autosomes, but not necessary to do so -chrom_range = [str(x) for x in range(1, 23)] -chrom_range.extend(["chr" + x for x in chrom_range]) -chrom_range.append("hpv16ref_1") # use one representative entry from the viral genome collection to catch a viral ref. -chrom_range = set(chrom_range) - - -def get_ref_fname(aa_dr_path, rname): - with open(aa_dr_path + "/" + rname + "/file_list.txt") as infile: - for line in infile: - fields = line.rstrip().rsplit() - if fields[0] == "fa_file": - return fields[1] - - logging.error("ERROR: AA data repo 'file_list.txt' not found!\n") - return None - - -# get a subset of the chromosome names/lengths from a .fai file. -def get_ref_seq_lens(ref_genome_size_file): - chr_sizes = {} - try: - with open(ref_genome_size_file) as infile: - for line in infile: - fields = line.rstrip().rsplit() - if fields[0] in chrom_range: - chr_sizes[fields[0]] = int(fields[1]) - - except IOError: - pass - - return chr_sizes - - -# read bam header and store info -def get_bam_header(bamf, samtools): - cmd = samtools + " view -H " + bamf - return subprocess.check_output(cmd, shell=True).decode("utf-8") - - -# extract sequence lengths and ids -def extract_seq_info(bam_header): - bamSeqLenD = defaultdict(int) - linelist = bam_header.rsplit("\n") - for line in (x for x in linelist if x.startswith("@SQ")): - fields = line.rstrip().rsplit()[1:] - ld = {i.rsplit(":")[0]: i.rsplit(":")[1] for i in fields if ":" in i} - bamSeqLenD[ld["SN"]] = int(ld["LN"]) - - return bamSeqLenD - - -# check if bam matches to a reference genome in terms of length and sequence name -# returns false if the same chromosome has different length in bam vs. reference -# returns false if no chromosome names are shared between bam/reference -# returns true if no shared chromosomes have different lengths and at least one chromosome is present. -def match_ref(bamSeqLenD, ref_len_d): - overlaps = 0 - for chrom, len in ref_len_d.items(): - if bamSeqLenD[chrom] > 0 and len != bamSeqLenD[chrom]: - return False - - elif len == bamSeqLenD[chrom]: - overlaps += 1 - - return overlaps - - -# check properly paired rate on bam file -def check_properly_paired(bamf, samtools): - cmd = samtools + " flagstat {} | grep 'properly paired'".format(bamf) - t = str(subprocess.check_output(cmd, shell=True).decode("utf-8")) - logging.info("\n" + bamf + ": " + t.rstrip()) - ppp = float(t.rsplit("(")[-1].rsplit("%")[0]) - if t.startswith("0 + 0"): - logging.error( - "\nERROR: IMPROPERLY GENERATED BAM FILE! No properly-paired reads were found. The most common " - "reason for this behavior is that the reference genome contained alt contigs that were not " - "indicated to the aligner. You must re-align to use AA (and many other bioinformatic tools) on" - " this data.\n\n" - ) - sys.exit(1) - - elif ppp < 95: - logging.warning( - "WARNING: BAM FILE PROPERLY PAIRED RATE IS BELOW 95%.\nQuality of data may be insufficient for AA " - "analysis. Poorly controlled insert size distribution during sample prep can cause high fractions of read" - " pairs to be marked as discordant during alignment. Artifactual short SVs and long runtimes may occur!" 
- "\n" - ) - - return ppp - - -# check if the BAM reference matches to sequence names & lengths in a dictionary of .fai files -# returns the name of the reference genome the BAM matches to, or prints error and returns None. -def check_ref(bamf, ref_to_fai_dict, samtools): - bam_header = get_bam_header(bamf, samtools) - bamSeqLenD = extract_seq_info(bam_header) - bestref = None - bestrefhits = 0 - for refName, fai_path in ref_to_fai_dict.items(): - ref_len_d = get_ref_seq_lens(fai_path) - matched = match_ref(bamSeqLenD, ref_len_d) - if matched: - if matched > bestrefhits: - bestref = refName - bestrefhits = matched - - elif bestref and matched == bestrefhits and "_viral" in bestref and "_viral" not in refName: - bestref = refName - bestrefhits = matched - - if bestref: - logging.info("Matched " + bamf + " to reference genome " + bestref) - return bestref - - em1 = "ERROR: Could not match BAM to a known AA reference genome!\n" - em2 = """This may happen if 1) The value provided to optional argument '--ref' does not match the - reference the BAM is aligned to, or 2) The corresponding AA data repo folder for this reference - is not present, or 3) The BAM uses a different chromosome naming convention (e.g. accession - numbers instead of chromosome names). Consider inspecting the header of the BAM file and the AA - data repo directory.\n""" - - logging.error(em1) - logging.error(em2) - sys.stderr.write(em1) - sys.stderr.write(em2) - - return None diff --git a/bin/cnv_prefilter.py b/bin/cnv_prefilter.py deleted file mode 100644 index 495c85e7..00000000 --- a/bin/cnv_prefilter.py +++ /dev/null @@ -1,216 +0,0 @@ -#!/usr/bin/env python - -# Author: Jens Luebeck -# Contact: jluebeck [at] ucsd.edu -# License: BSD 2-Clause License -# Source: https://github.com/AmpliconSuite/AmpliconSuite-pipeline -# Commit: 0a8a2ff2324b15aab7cb88d310dcc458d06c0bed - -from collections import defaultdict -import logging -import os - -from intervaltree import IntervalTree - - -def merge_intervals(usort_intd, cn_cut=4.5, tol=1, require_same_cn=False, ref=None): - merged_intd = defaultdict(IntervalTree) - for chrom, usort_ints in usort_intd.items(): - # sort ints - sort_ints = sorted( - [x for x in usort_ints if x[2] > cn_cut or (ref == "GRCh38_viral" and not chrom.startswith("chr"))] - ) - if not sort_ints: - continue - - # merge sorted ints - mi = [sort_ints[0]] - for ival in sort_ints[1:]: - pass_cn_check = True - if require_same_cn and not ival[2] == mi[-1][2]: - pass_cn_check = False - - if ival[0] <= mi[-1][1] + tol and pass_cn_check: - ui = (mi[-1][0], max(ival[1], mi[-1][1]), mi[-1][2]) - mi[-1] = ui - - else: - mi.append(ival) - - for x in mi: - merged_intd[chrom].addi(x[0], x[1], x[2]) - - return merged_intd - - -# create an interval list (chrom, start, end, CN) from a dict of interval trees. 
-def ivald_to_ilist(ivald): - ivals = [] - for chrom, ivalt in ivald.items(): - for ival in ivalt: - ivals.append((chrom, ival.begin, ival.end, ival.data)) - - return ivals - - -# takes list of tuples (chrom, start, end, cn) -def compute_cn_median(cnlist, armlen): - cnsum = sum([x[2] - x[1] for x in cnlist]) - if cnsum < 0.5 * armlen: - return 2.0 - - halfn = cnsum / 2.0 - scns = sorted(cnlist, key=lambda x: x[3]) - rt = 0 - ccn = 0 - for x in scns: - ccn = x[3] - rt += x[2] - x[1] - if rt >= halfn: - break - - return ccn - - -def read_bed(ifname, keepdat=False): - beddict = defaultdict(IntervalTree) - with open(ifname) as infile: - for line in infile: - line = line.rstrip() - if line: - fields = line.rsplit() - s, e = int(fields[1]), int(fields[2]) - if e - s == 0: - logging.warning("Size 0 interval found. Skipping: " + line) - continue - - if keepdat: - beddict[fields[0]].addi(s, e, tuple(fields[3:])) - else: - beddict[fields[0]].addi(s, e) - - return beddict - - -# read regions to split on/filter into dictionary of interval trees, where keys are chromosomes -def read_gain_regions(ref): - AA_DATA_REPO = os.environ["AA_DATA_REPO"] + "/" + ref + "/" - fdict = {} - with open(AA_DATA_REPO + "file_list.txt") as infile: - for line in infile: - line = line.rstrip() - if line: - fields = line.rsplit() - fdict[fields[0]] = fields[1] - - grf = AA_DATA_REPO + fdict["conserved_regions_filename"] - gain_regions = read_bed(grf) - - return gain_regions - - -def get_continuous_high_regions(bedfile, cngain): - raw_input = defaultdict(list) - with open(bedfile) as infile: - for line in infile: - fields = line.rstrip().rsplit("\t") - c, s, e = fields[0], int(fields[1]), int(fields[2]) + 1 - cn = float(fields[-1]) - raw_input[c].append((s, e, cn)) - - return merge_intervals(raw_input, cn_cut=cngain, tol=300000) - - -# take CNV calls (as bed?) 
- have to update to not do CNV_GAIN -# input bed file, centromere_dict -# output: path of prefiltered bed file -def prefilter_bed(bedfile, ref, centromere_dict, chr_sizes, cngain, outdir): - # interval to arm lookup - region_ivald = defaultdict(IntervalTree) - for key, value in chr_sizes.items(): - try: - cent_tup = centromere_dict[key] - region_ivald[key].addi(0, int(cent_tup[0]), key + "p") - region_ivald[key].addi(int(cent_tup[1]), int(value), key + "q") - - # handle mitochondrial contig or other things (like viral genomes) - except KeyError: - region_ivald[key].addi(0, int(value), key) - - # store cnv calls per arm - arm2cns = defaultdict(list) - arm2lens = defaultdict(int) - with open(bedfile) as infile: - for line in infile: - fields = line.rstrip().rsplit("\t") - c, s, e = fields[0], int(fields[1]), int(fields[2]) + 1 - if c == "hs37d5": - continue - - cn = float(fields[-1]) - a = region_ivald[c][(s + e) // 2] - if not a: - a = region_ivald[c][s:e] - - if a: - carm_interval = a.pop() - carm = carm_interval.data - arm2cns[carm].append((c, s, e, cn)) - arm2lens[carm] = carm_interval.end - carm_interval.begin - - else: - arm2cns["other"].append((c, s, e, cn)) - logging.debug("Did not match " + c + ":" + str(s) + "-" + str(e) + " to a known chromosome arm!") - - continuous_high_region_ivald = get_continuous_high_regions(bedfile, cngain) - cn_filt_entries = [] - for a in sorted(arm2cns.keys()): - # compute the median CN of the arm - init_cns = arm2cns[a] - med_cn = compute_cn_median(init_cns, arm2lens[a]) - for x in init_cns: - long_seed_region_penalty_mult = 1.0 - # ignore CN segments over 30 Mbp - if x[2] - x[1] > 30000000: - continue - - # penalize segments over 20 Mbp - elif x[2] - x[1] > 20000000: - long_seed_region_penalty_mult = 2.0 - - continuous_high_hits = continuous_high_region_ivald[x[0]][x[1] : x[2]] - if continuous_high_hits: - for y in continuous_high_hits: - # penalize seeds that overlap a high-CN region of 10 Mbp or more - if y.end - y.begin > 10000000: - long_seed_region_penalty_mult = max(1.5, long_seed_region_penalty_mult) - - ccg = cngain * long_seed_region_penalty_mult - if x[3] > med_cn + ccg - 2: - cn_filt_entries.append(x) - - elif ref == "GRCh38_viral" and not x[0].startswith("chr") and x[3] >= 1: - cn_filt_entries.append(x) - - gain_regions = read_gain_regions(ref) - # now remove regions based on filter regions - filt_ivald = defaultdict(IntervalTree) - for x in cn_filt_entries: - cit = IntervalTree() - cit.addi(x[1], x[2]) - bi = gain_regions[x[0]] - for y in bi: - cit.slice(y.begin) - cit.slice(y.end) - - for p in sorted(cit): - filt_ivald[x[0]].addi(p[0], p[1], x[3]) - - merged_filt_ivald = merge_intervals(filt_ivald, cn_cut=cngain, require_same_cn=True, ref=ref) - final_filt_entries = ivald_to_ilist(merged_filt_ivald) - bname = outdir + "/" + bedfile.rsplit("/")[-1].rsplit(".bed")[0] + "_pre_filtered.bed" - with open(bname, "w") as outfile: - for entry in final_filt_entries: - outfile.write("\t".join([str(x) for x in entry]) + "\n") - - return bname diff --git a/bin/collect_seeds.py b/bin/collect_seeds.py deleted file mode 100755 index 35e74a43..00000000 --- a/bin/collect_seeds.py +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env python - -# Code adapted from PrepareAA (https://github.com/jluebeck/PrepareAA) -# commit/92b81ba55356af85958985b8f80308c8f88921ac - - -import argparse -from datetime import datetime -from subprocess import call - - -# Read the CNVkit .cns files -def collect_seeds(sample, cns): - with open(cns) as infile, open(sample + 
"_CNV_GAIN.bed", "w") as outfile: - head = next(infile).rstrip().rsplit("\t") - for line in infile: - fields = line.rstrip().rsplit("\t") - s, e = int(fields[1]), int(fields[2]) - cn_r = float(fields[4]) - cn = 2 ** (cn_r + 1) - if cn >= args.cngain: # do not filter on size since amplified_intervals.py will merge small ones. - outline = "\t".join(fields[0:3] + ["CNVkit", str(cn)]) + "\n" - outfile.write(outline) - return sample + "_CNV_GAIN.bed" - - -# MAIN # -if __name__ == "__main__": - # Parses the command line arguments - parser = argparse.ArgumentParser(description="Collect AmpliconArchitect Copy Number Seeds") - parser.add_argument("-s", "--sample", help="sample name", required=True) - parser.add_argument("--cns", help="CNVKit .cns file of CNV changes.", default="") - parser.add_argument( - "--cngain", - type=float, - help="CN gain threshold to consider for AA seeding", - default=4.5, - ) - args = parser.parse_args() - collect_seeds(args.sample, args.cns) diff --git a/bin/downsample.py b/bin/downsample.py deleted file mode 100755 index ec373808..00000000 --- a/bin/downsample.py +++ /dev/null @@ -1,202 +0,0 @@ -#!/usr/bin/env python - -# This software is Copyright 2017 The Regents of the University of California. All Rights Reserved. Permission to copy, modify, and distribute this software and its documentation for educational, research and non-profit purposes, without fee, and without a written agreement is hereby granted, provided that the above copyright notice, this paragraph and the following three paragraphs appear in all copies. Permission to make commercial use of this software may be obtained by contacting: -# -# Office of Innovation and Commercialization -# -# University of California -# -# La Jolla, CA 92093-0910 -# -# (858) 534-5815 -# -# invent@ucsd.edu -# -# This software program and documentation are copyrighted by The Regents of the University of California. The software program and documentation are supplied "as is", without any accompanying services from The Regents. The Regents does not warrant that the operation of the program will be uninterrupted or error-free. The end-user understands that the program was developed for research purposes and is advised not to rely exclusively on the program for any reason. -# -# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 
- -# Author: Viraj Deshpande -# Contact: virajbdeshpande@gmail.com -# Source: https://github.com/jluebeck/AmpliconArchitect -# Commit: 2172cdfd5b2834f98f60a5ee77f282249e16f527 - -from time import time - -TSTART = time() -import pysam -import argparse -from time import time -import os -import matplotlib - -matplotlib.use("Agg") -import random - -import global_names - -parser = argparse.ArgumentParser(description="Reconstruct Amplicons connected to listed intervals.") -parser.add_argument( - "--bam", dest="bam", help="Coordinate sorted BAM file with index", metavar="FILE", action="store", type=str, nargs=1 -) -parser.add_argument( - "--final", - dest="final", - help="Optional Final coverage. Default is 10. If initial coverage is less than final, do nothing.", - metavar="FLOAT", - action="store", - type=float, - default=10.0, -) -parser.add_argument( - "--downsample_dir", - dest="downsample_dir", - help="Optional directory to output. Default is same as original bamfile", - metavar="DIR", - action="store", - type=str, - default="", -) -parser.add_argument( - "--cbam", - dest="cbam", - help="Optional bamfile to use for coverage calculation. Also generates new coverage bam file in downsample_dir.", - metavar="FILE", - action="store", - type=str, - default=None, -) -parser.add_argument( - "--cbed", - dest="cbed", - help="Optional bedfile defining 1000 10kbp genomic windows for coverage calcualtion", - metavar="FILE", - action="store", - type=str, - default=None, -) -parser.add_argument( - "--ref", - dest="ref", - help='Values: [hg19, GRCh37, GRCh38, GRCh38_viral, mm10, None]. "hg19", "mm10", "GRCh38" : chr1, .. chrM etc / "GRCh37" : \'1\', \'2\', .. \'MT\' etc/ "None" : Do not use any annotations. AA can tolerate additional chromosomes not stated but accuracy and annotations may be affected.', - metavar="STR", - action="store", - type=str, - required=True, -) -parser.add_argument( - "--cstats_only", - help="Compute the coverage statistics for the BAM file and exit. 
Do not perform any downsampling.", - action="store_true", -) -parser.add_argument( - "--random_seed", - dest="random_seed", - help="Set flag to use the numpy default random seed (sets np.random.seed(seed=None)), otherwise will use seed=0", - action="store_true", - default=False, -) - -args = parser.parse_args() - -global_names.REF = args.ref -global_names.TSTART = TSTART -if args.random_seed: - global_names.SEED = None - -import bam_to_breakpoint as b2b -from breakpoint_graph import * - - -if os.path.splitext(args.bam[0])[-1] == ".cram": - bamFile = pysam.Samfile(args.bam[0], "rc") -else: - bamFile = pysam.Samfile(args.bam[0], "rb") -cbam = None -if args.cbam is not None: - if os.path.splitext(args.cbam[0])[-1] == ".cram": - cbam = pysam.Samfile(args.cbam, "rc") - else: - cbam = pysam.Samfile(args.cbam, "rb") -cbed = args.cbed - - -coverage_stats_file = open(hg.DATA_REPO + "/coverage.stats") -cstats = None -cb = bamFile -if cbam is not None: - cb = cbam - -for l in coverage_stats_file: - ll = l.strip().split() - bamfile_pathname = str(cb.filename.decode()) - if ll[0] == os.path.abspath(bamfile_pathname): - bamfile_filesize = os.path.getsize(bamfile_pathname) - - cstats = tuple(map(float, ll[1:])) - if len(cstats) < 15 or cstats[13] != 3 or bamfile_filesize != int(cstats[14]): # 3 is default sdevs - cstats = None - -coverage_stats_file.close() -coverage_windows = None -if cbed is not None: - coverage_windows = hg.interval_list(cbed, "bed") - coverage_windows.sort() -if cstats is None and cbam is not None: - cbam2b = b2b.bam_to_breakpoint(cbam, coverage_stats=cstats, coverage_windows=coverage_windows) - cstats = cbam2b.basic_stats -elif cstats is None: - bamFileb2b = b2b.bam_to_breakpoint(bamFile, coverage_stats=cstats, coverage_windows=coverage_windows) - cstats = bamFileb2b.basic_stats - -print("Estimated bamfile coverage is ", str(cstats[0])) -if args.cstats_only: - sys.exit(0) - -final = args.final - -if cstats[0] <= final: - exit() -ratio = float(final) / float(cstats[0]) - -print( - "Downsampling:", - args.bam[0], - "Estimated original coverage:", - float(cstats[0]), - "Desired final coverage:", - final, - "DS ratio:", - ratio, -) - -downsample_dir = os.path.dirname(os.path.abspath(args.bam[0])) -if args.downsample_dir != "": - downsample_dir = args.downsample_dir - -i = 0 -rulist = [] -t0 = time() -b2 = pysam.Samfile(downsample_dir + "/" + os.path.basename(args.bam[0])[:-4] + ".DS.bam", "wb", template=bamFile) - -seed_shift = str(t0) -if global_names.SEED is not None: - seed_shift = str(global_names.SEED) - -for a in bamFile.fetch(): - random.seed(a.query_name + seed_shift) - - ru = random.uniform(0, 1) - if ru < ratio: - b2.write(a) -b2.close() -pysam.index(downsample_dir + "/" + os.path.basename(args.bam[0])[:-4] + ".DS.bam") - -# if args.cbam is not None and not os.path.exists(downsample_dir + '/' + os.path.basename(args.cbam)[:-4] + '.DS.bam'): -# c2 = pysam.Samfile(downsample_dir + '/' + os.path.basename(args.cbam)[:-4] + '.DS.bam', 'wb', template = cbam) -# for a in cbam.fetch(): -# random.seed(a.qname) -# if random.uniform(0, 1) < ratio: -# c2.write(a) -# c2.close() -# pysam.index(downsample_dir + '/' + os.path.basename(args.cbam)[:-4] + '.DS.bam') diff --git a/bin/extract_circle_SV_reads.py b/bin/extract_circle_SV_reads.py old mode 100644 new mode 100755 diff --git a/bin/global_names.py b/bin/global_names.py deleted file mode 100755 index 2cb38852..00000000 --- a/bin/global_names.py +++ /dev/null @@ -1,5 +0,0 @@ -# Source: https://github.com/jluebeck/AmpliconArchitect -# 
Commit: 2172cdfd5b2834f98f60a5ee77f282249e16f527 -REF = "hg19" -TSTART = 0 -SEED = 0 diff --git a/bin/hg19util.py b/bin/hg19util.py deleted file mode 100755 index f2e5b50c..00000000 --- a/bin/hg19util.py +++ /dev/null @@ -1,863 +0,0 @@ -# This software is Copyright 2017 The Regents of the University of California. All Rights Reserved. Permission to copy, modify, and distribute this software and its documentation for educational, research and non-profit purposes, without fee, and without a written agreement is hereby granted, provided that the above copyright notice, this paragraph and the following three paragraphs appear in all copies. Permission to make commercial use of this software may be obtained by contacting: -# -# Office of Innovation and Commercialization -# -# University of California -# -# La Jolla, CA 92093-0910 -# -# (858) 534-5815 -# -# invent@ucsd.edu -# -# This software program and documentation are copyrighted by The Regents of the University of California. The software program and documentation are supplied "as is", without any accompanying services from The Regents. The Regents does not warrant that the operation of the program will be uninterrupted or error-free. The end-user understands that the program was developed for research purposes and is advised not to rely exclusively on the program for any reason. -# -# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. - -# Author: Viraj Deshpande -# Contact: virajbdeshpande@gmail.com - - -##This is a suite to load reference genome (not just hg19, as filename implies), genes, exons, repeat content and perform operations on this genome, compare variants -## it handles annotations from a database and is not restricted to solely hg19 if global_names.REF is not hg19. - -import sys -from bisect import bisect_left -from collections import defaultdict -from time import clock -import pysam -import heapq -import copy -import os -import logging - -if sys.version_info < (3, 0): - from sets import Set - -import global_names - -try: - DATA_REPO = os.environ["AA_DATA_REPO"] -except: - logging.warning( - "#TIME " + "%.3f\t" % clock() + " Unable to set AA_DATA_REPO variable. Setting to working directory" - ) - DATA_REPO = "." -if DATA_REPO == "." or DATA_REPO == "": - logging.warning("#TIME " + "%.3f\t" % clock() + " AA_DATA_REPO not set or empy. Setting to working directory") - DATA_REPO = "." - -REF = global_names.REF -print("Global ref name is " + REF) - -REF_files = defaultdict(lambda: "", {}) -try: - for l in open(DATA_REPO + "/" + REF + "/file_list.txt"): - REF_files[l.strip().split()[0]] = l.strip().split()[1] -except: - logging.warning( - "#TIME " - + "%.3f\t" % clock() - + " Unable to find reference in $AA_DATA_REPO/REF/file_list.txt. Setting to empty." 
- ) - - -class fake_fasta(object): - def fetch(self, a=None, b=0, c=0): - return "".join(["N" for i in range(c - b + 1)]) - - -try: - fa_file = pysam.Fastafile(DATA_REPO + "/" + REF + "/" + REF_files["fa_file"]) -except: - logging.warning( - "#TIME " - + "%.3f\t" % clock() - + ' Unable to open fasta file: "' - + DATA_REPO - + "/" - + REF - + "/" - + REF_files["fa_file"] - + '". Reference sequences will be set to N.' - ) - fa_file = fake_fasta() - -chrLen_filename = DATA_REPO + "/" + REF + "/" + REF_files["chrLen_file"] -duke35_filename = DATA_REPO + "/" + REF + "/" + REF_files["duke35_filename"] -wgexclude_filename = DATA_REPO + "/" + REF + "/" + REF_files["mapability_exclude_filename"] -gene_filename = DATA_REPO + "/" + REF + "/" + REF_files["gene_filename"] -exon_filename = DATA_REPO + "/" + REF + "/" + REF_files["exon_file"] -oncogene_filename = DATA_REPO + "/" + REF + "/" + REF_files["oncogene_filename"] -centromere_filename = DATA_REPO + "/" + REF + "/" + REF_files["centromere_filename"] -conserved_regions_filename = DATA_REPO + "/" + REF + "/" + REF_files["conserved_regions_filename"] -segdup_filename = DATA_REPO + "/" + REF + "/" + REF_files["segdup_filename"] -complementary_nucleotide = defaultdict( - lambda: "N", - { - "A": "T", - "C": "G", - "G": "C", - "T": "A", - "a": "t", - "c": "g", - "g": "c", - "t": "a", - "n": "n", - "N": "N", - }, -) -duke35 = [] -duke35_exists = [True] - -# Handling chromosome names, lengths, sorting, positions and addition of new chromosomes -chr_id = {} -chrName = {} - - -def chrNum(chrname, mode="append"): - if chrname in chr_id: - return chr_id[chrname] - else: - if mode == "init": - cnum = len(chr_id) - else: - cnum = 1000000 + len(chr_id) - chr_id[chrname] = cnum - chrName[cnum] = chrname - return chr_id[chrname] - - -chrLen = defaultdict(lambda: 0, {}) -try: - for line in open(chrLen_filename): - ll = line.strip().split() - chrLen[chrNum(ll[0], mode="init")] = int(ll[1]) -except: - logging.warning( - "#TIME " + "%.3f\t" % clock() + ' Unable to open chromosome lengths file: "' + chrLen_filename + '"' - ) - -chrOffset = {} - - -def absPos(chrname, pos=0): - cnum = chrNum(chrname) - if chrNum(chrname) not in chrOffset: - chrkeys = sorted(chrName.keys()) - sumlen = sum([chrLen[c] for c in chrLen if c in chrOffset]) - for i in range(len(chrkeys)): - if chrkeys[i] not in chrOffset: - chrOffset[chrkeys[i]] = sumlen - sumlen += chrLen[chrkeys[i]] - if cnum < chrkeys[i]: - break - return chrOffset[chrNum(chrname)] + pos - - -for c in chrLen: - ap = absPos(chrName[c]) - - -def chrPos(abspos): - for c in chrOffset: - if chrOffset[c] < abspos and chrOffset[c] + chrLen[c] >= abspos: - return (chrName[c], abspos - chrOffset[c]) - return None - - -def update_chrLen(len_list): - for l in len_list: - chrLen[chrNum(l[0])] = int(l[1]) - for l in len_list: - cpos = absPos(l[0], 1) - - -def reverse_complement(seq): - return "".join([complementary_nucleotide[a] for a in seq][::-1]) - - -class interval(object): - def __init__( - self, - line, - start=-1, - end=-1, - strand=1, - file_format="", - bamfile=None, - info="", - exclude_info_string=False, - ): - self.info = "" - self.file_format = file_format - if type(line) == pysam.AlignedRead or type(line) == pysam.AlignedSegment: - self.load_pysamread(line, bamfile) - elif start == -1: - self.load_line(line, file_format, exclude_info_string=exclude_info_string) - elif end == -1: - self.load_pos(line, start, start, strand) - else: - self.load_pos(line, start, end, strand) - if len(info) > 0: - self.info = info - - def 
load_line(self, line, file_format, exclude_info_string=False): - if file_format == "": - if len(line.strip().split()) == 1: - self.chrom = line.split(":")[0] - self.start = int(line.split(":")[1].split("-")[0]) - if "-" not in line: - self.end = int(line.split(":")[1].split("-")[0]) - else: - self.end = int(line.split(":")[1].split("-")[1]) - if self.start < self.end: - self.strand = 1 - else: - self.strand = -1 - return - else: - file_format = "bed" - if file_format == "gff": - ll = line.strip().split() - self.chrom = ll[0] - self.start, self.end = sorted([int(float(ll[3])), int(float(ll[4]))]) - if ll[6] == "+": - self.strand = 1 - else: - self.strand = -1 - if not exclude_info_string: - self.info = {r[0 : r.find("=")]: r[r.find("=") + 1 :] for r in ll[8].strip().strip(";").split(";")} - self.info["Variant"] = ll[5] - elif file_format == "bed": - ll = line.strip().split() - self.chrom = ll[0] - if (REF == "hg19" or REF == "GRCh38") and 0 < len(self.chrom) < 3: - try: - ci = int(self.chrom) - if 0 < ci < 23: - self.chrom = "chr" + self.chrom - logging.info("Corrected chromosome name (appended 'chr') " + self.chrom + " \n") - - except ValueError: - if self.chrom in {"M", "X", "Y"}: - self.chrom = "chr" + self.chrom - else: - logging.warning("Chromosome name " + self.chrom + " may be incompatible") - - self.start, self.end = sorted([int(float(ll[1])), int(float(ll[2]))]) - if int(float(ll[2])) >= int(float(ll[1])): - self.strand = 1 - else: - self.strand = -1 - if not exclude_info_string: - self.info = ll[3:] - else: - raise (Exception("Invalid interval format" + str(line))) - - def load_pos(self, chrom, start, end, strand): - self.chrom = chrom - self.start = int(start) - self.end = int(end) - self.strand = strand - if start > end: - self.start = int(end) - self.end = int(start) - self.strand = -1 * strand - - def load_pysamread(self, line, bamfile): - if bamfile is None: - raise Exception("Interval of pysam AlignedRead without bamfile") - self.chrom = line.reference_name - self.start = line.reference_start - self.end = 0 - if line.reference_end is not None: - self.end = line.reference_end - else: - logging.warning("Reference_end for " + str(self) + " was NoneType. 
Setting to 0.") - - if line.is_reverse: - self.strand = -1 - else: - self.strand = 1 - - def __gt__(self, y): - if self.chrom != y.chrom: - return chrNum(self.chrom) > chrNum(y.chrom) - elif int(self.end) != int(y.end): - return int(self.end) > int(y.end) - else: - return int(self.start) > int(y.start) - - def size(self): - return self.end - self.start + 1 - - def __str__(self): - if len(str(self.info)) == 0: - return "\t".join(map(str, [self.chrom, self.start, self.end])) - elif type(self.info) == list: - return "\t".join(map(str, [self.chrom, self.start, self.end] + self.info)) - elif type(self.info) == dict: - return "\t".join( - map( - str, - [self.chrom, self.start, self.end] + [str(s) + "=" + str(self.info[s]) for s in self.info], - ) - ) - else: - return "\t".join(map(str, [self.chrom, self.start, self.end, self.info])) - - def gc_content(self): - seq = fa_file.fetch(self.chrom, self.start, self.end) - # if 'G' in seq: - # print seq, seq.count('G'), seq.count('C'), float(seq.count('G') + seq.count('C')) / len(seq) - # exit() - if len(seq) == 0: - return 0.5 - return float(seq.count("G") + seq.count("C") + seq.count("g") + seq.count("c")) / len(seq) - - def sequence(self, new_fa_file=None): - if new_fa_file is not None: - seq = new_fa_file.fetch(self.chrom, self.start, self.end + 1) - else: - seq = fa_file.fetch(self.chrom, self.start, self.end + 1) - if self.strand == 1: - return seq - else: - return "".join([complementary_nucleotide[a] for a in seq][::-1]) - - def intersects(self, n, extend=0, margin=0.0): - if margin > 0.0: - if self.intersects( - interval(n.chrom, n.start, n.end - (1 - margin) * (n.end - n.start)) - ) and self.intersects(interval(n.chrom, n.start + (1 - margin) * (n.end - n.start)), n.end): - return True - else: - s = self - if n.intersects(interval(s.chrom, s.start, s.end - (1 - margin) * (s.end - s.start))) and n.intersects( - interval(s.chrom, s.start + (1 - margin) * (s.end - s.start)), s.end - ): - return True - return False - a = [self.chrom, max(0, self.start - extend), self.end + extend] - b = [n.chrom, n.start, n.end] - if a[0] != b[0]: - return False - if (int(a[1]) - int(b[1])) * (int(a[2]) - int(b[1])) <= 0: - return True - if (int(a[1]) - int(b[2])) * (int(a[2]) - int(b[2])) <= 0: - return True - if (int(a[1]) - int(b[1])) * (int(a[1]) - int(b[2])) <= 0: - return True - if (int(a[2]) - int(b[1])) * (int(a[2]) - int(b[2])) <= 0: - return True - return False - - def intersection(self, y): - if not self.intersects(y): - return None - return interval(self.chrom, max(self.start, y.start), min(self.end, y.end)) - - def merge(self, y, extend=0): - if not self.intersects(y, extend): - return None - return interval(self.chrom, min(self.start, y.start), max(self.end, y.end)) - - def atomize(self, y): - il = interval_list([self, y]) - il.sort() - ilr = [] - if il[0].intersects(il[1]): - ilint = il[0].intersection(il[1]) - if il[0].start < il[1].start: - ilr.append((interval(il[0].chrom, il[0].start, ilint.start - 1), [il[0]])) - elif il[1].start < il[0].start: - ilr.append((interval(il[1].chrom, il[1].start, ilint.start - 1), [il[1]])) - ilr.append((ilint, il)) - if il[0].end > il[1].end: - ilr.append((interval(il[0].chrom, ilint.end + 1, il[0].end), [il[0]])) - elif il[1].end > il[0].end: - ilr.append((interval(il[1].chrom, ilint.end + 1, il[1].end), [il[1]])) - return ilr - else: - return [(il[0], [il[0]]), (il[1], [il[1]])] - - def contains(self, x, y=-1, z=-1): - if type(x) == interval: - if self.intersects(x) and self.intersection(x).size() == x.size(): - 
return True - else: - return False - if y != -1: - if z == -1: - z = y - if ( - self.intersects(interval(x, y, z)) - and self.intersection(interval(x, y, z)).size() == interval(x, y, z).size() - ): - return True - return False - - def filter_repeat(self): - if len(interval_list([self]).intersection(wgexclude)) > 0: - return True - if len(interval_list([self]).intersection(conserved_regions)) > 0: - return True - if self.rep_content() > 4.5: - return True - return False - - def rep_content(self): - # logging.info("#TIME " + '%.3f\t'%clock() + " rep_content: init ") - if self.chrom == "chrM" or self.chrom == "MT": - return 5.0 - if self.chrom.strip("chr") not in map(str, range(1, 23)) + ["X" + "Y"]: - return 1.0 - s34 = interval(self.chrom, self.start, max(self.start, self.end - 34)) - # logging.info("#TIME " + '%.3f\t'%clock() + " rep_content: to load duke ") - if duke35_exists[0] and len(duke35) == 0: - try: - duke35file = open(duke35_filename) - duke35.extend([l.strip() for l in duke35file]) - duke35file.close() - except: - logging.warning( - "#TIME " - + "%.3f\t" % clock() - + ' rep_content: Unable to open mapability file "' - + duke35_filename - + '".' - ) - duke35_exists[0] = False - duke35.extend(["chr_Un 0 1 1"]) - # logging.info("#TIME " + '%.3f\t'%clock() + " rep_content: duke loaded") - ictime = 0 - itime = 0 - hi = len(duke35) - 1 - lo = 0 - numiter = 0 - while hi - lo > 1: - numiter += 1 - p = (hi + lo) / 2 - ctime = clock() - m = interval(duke35[p]) - ictime += clock() - ctime - ctime = clock() - if s34.intersects(m) or m > s34: - hi = p - else: - lo = p - itime += clock() - ctime - p = lo - m = interval(duke35[p]) - sum_duke = 0 - len_duke = 0 - # logging.info("#TIME " + '%.3f\t'%clock() + " rep_content: found " + str(numiter) + " " + str(ictime) + " " + str(itime)) - while s34 > m or s34.intersects(m): - if not s34.intersects(m): - p += 1 - if p >= len(duke35) or p <= 0: - raise Exception( - "p index out of range: " - + str(p) - + " " - + str(lo) - + " " - + str(self) - + " " - + str(m) - + " " - + str(interval(duke35[lo])) - ) - m = interval(duke35[p]) - continue - repc = 5.0 if float(m.info[0]) == 0 else 1.0 / float(m.info[0]) - sum_duke += s34.intersection(m).size() * repc - len_duke += s34.intersection(m).size() - p += 1 - if p >= len(duke35): - break - m = interval(duke35[p]) - # logging.info("#TIME " + '%.3f\t'%clock() + " rep_content: done") - # exit() - if len_duke > 0: - return sum_duke / len_duke - else: - return 1.0 - - def num_unmasked(self): - if self.chrom not in fa_file.references: - return self.size() - seq = fa_file.fetch(self.chrom, self.start, self.end) - return len([c for c in seq if c in "ACGT"]) - - def segdup_uniqueness(self): - sl = interval_list([self]).intersection(segdup_list) - slsd = sum([self.intersection(i[1]).size() for i in sl]) - return float(self.size()) / (self.size() + slsd) - - def extend(self, extend_len=0): - return interval( - self.chrom, - max(0, self.start - extend_len), - min(self.end + extend_len, chrLen[chrNum(self.chrom)]), - self.strand, - ) - - -class interval_list(list, object): - def __init__(self, ilist=None, file_format=None, sort=True, exclude_info_string=False): - if ilist == None: - ilist = [] - self.file_format = file_format - if file_format in ["bed", "gff"]: - self.bed_to_list(ilist, exclude_info_string=exclude_info_string) - if file_format is None: - list.__init__(self, ilist) - if sort: - self.sort() - self.offset = None - - def bed_to_list(self, file_name, exclude_info_string=False): - if file_name is not None: - 
try: - f = open(file_name) - list.__init__( - self, - [ - interval( - l, - file_format=self.file_format, - exclude_info_string=exclude_info_string, - ) - for l in f - if len(l.strip().split()) > 2 and l.strip()[0] != "#" - ], - ) - f.close() - except: - logging.warning( - "#TIME " + "%.3f\t" % clock() + ' interval_list: Unable to open interval file "' + file_name + '".' - ) - - def merge_clusters(self, extend=0, margin=0.0): - ml = [] - ci = None - cl = [] - ai = 0 - cend = len(self) - for a in self[::-1]: - ai += 1 - if ci is None or not a.intersects(ci, extend, margin): - cstart = len(self) - ai + 1 - cl = self[cstart:cend] - if ci is not None: - ml.append((ci, cl)) - ci = a - cl = [] - cend = len(self) - ai + 1 - # if ai != sum([len(m[1]) for m in ml]) + 1: - # print "divergent", ai, str(a) - # exit() - ci = ci.merge(a, extend) - # cl.append(a) - cstart = 0 - cl = self[cstart:cend] - if ci is not None: - ml.append((ci, cl)) - return ml[::-1] - - def repeats(self, count=1): - activeq = [] - if activeq is None: - print("h1") - exit() - jinterval = None - ilist = [] - for a in self[::-1]: - while len(activeq) > 0 and not a.intersects(activeq[0][1]): - heapq.heappop(activeq) - if activeq is None: - print("h2") - exit() - if len(activeq) < count and jinterval is not None: - ilist.append((jinterval, copy.copy(aq))) - if activeq is None: - print("h3") - exit() - jinterval = None - heapq.heappush(activeq, (-1 * a.start, a)) - if len(activeq) >= count: - if jinterval is None: - jinterval = interval(a.chrom, activeq[0][1].start, a.end) - aq = copy.copy(activeq) - else: - jinterval.start = min(jinterval.start, activeq[0][1].start) - heapq.heappush(aq, (-1 * a.start, a)) - if jinterval is not None: - ilist.append((jinterval, copy.copy(aq))) - jinterval = None - return ilist[::-1] - - def intersection(self, l2, extend=0): - si = len(self) - 1 - l2i = len(l2) - 1 - sj = len(self) - 1 - l2j = len(l2) - 1 - il = [] - while si >= 0: - while l2i >= 0 and l2[l2i] > self[si] and not self[si].intersects(l2[l2i], extend=extend): - l2i -= 1 - l2j = l2i - while l2j >= 0 and self[si].intersects(l2[l2j], extend=extend): - il.append((self[si], l2[l2j])) - l2j -= 1 - si -= 1 - return il[::-1] - - def atomize(self, h2): - i = 0 - j = 0 - atomlist = [] - if len(self) > 0: - c1 = self[0] - if len(h2) > 0: - c2 = h2[0] - c = None - while i < len(self) or j < len(h2): - # if c is not None: - # print "%%", i, j, str(c[0]), [str(aa) for aa in c[1]], [str(aa[0]) for aa in atomlist] - # else: - # print "%%", i, j, [], [str(aa[0]) for aa in atomlist] - if c is not None: - if i < len(self) and self[i] not in c[1] and (self[i].intersects(c[0], -1) or c[0] > self[i]): - atm = self[i].atomize(c[0]) - atm = [ - ( - aa[0], - [(lambda x: c[1][0] if x == c[0] else x)(aai) for aai in aa[1]], - ) - for aa in atm - ] - # print "%i", [len(rr[1]) for rr in atm], [str(rr[0]) for rr in atm] - c = atm[-1] - i += 1 - atomlist += atm[:-1] - elif j < len(h2) and h2[j] not in c[1] and (h2[j].intersects(c[0], -1) or c[0] > h2[j]): - # print j, str(h2[j]), str(c[0]), c[0] > h2[j] - atm = c[0].atomize(h2[j]) - atm = [ - ( - aa[0], - [(lambda x: c[1][0] if x == c[0] else x)(aai) for aai in aa[1]], - ) - for aa in atm - ] - # print "%j", [len(rr[1]) for rr in atm], [str(rr[0]) for rr in atm] - c = atm[-1] - j += 1 - atomlist += atm[:-1] - else: - atomlist.append(c) - # if i < len(self) and self[i] in c[1]: - # i += 1 - # if j < len(h2) and h2[j] in c[1]: - # j += 1 - c = None - else: - if i >= len(self): - atomlist.append((h2[j], [h2[j]])) - j += 1 
- elif j >= len(h2): - atomlist.append((self[i], [self[i]])) - i += 1 - else: - atm = self[i].atomize(h2[j]) - atomlist += atm[:-1] - c = atm[-1] - # if self[i] not in c[1]: - i += 1 - # if h2[j] not in c[1]: - j += 1 - if c is not None: - atomlist.append(c) - return atomlist - - def get_repeat_content(self): - try: - duke35_file = open(duke35_filename) - print("counting repeats", clock()) - self.sort() - sum_duke = [0.0 for i in self] - len_duke = [0.0 for i in self] - lno = 0 - i = 0 - j = 0 - for line in duke35_file: - lno += 1 - duke_int = interval(line) - while not (duke_int.intersects(self[i])) and duke_int > self[i]: - i += 1 - if not duke_int.intersects(self[i]) and self[i] > duke_int: - continue - j = i - repc = 5.0 if float(duke_int.info[0]) == 0 else 1 / float(duke_int.info[0]) - while j < len(self) and self[j].intersects(duke_int): - sum_duke[j] += self[j].intersection(duke_int).size() * repc - len_duke[j] += self[j].intersection(duke_int).size() - j += 1 - duke35_file.close() - return {self[i]: sum_duke[i] / len_duke[i] for i in range(len(interval_list))} - except: - logging.warning( - "#TIME " - + "%.3f\t" % clock() - + ' get_repeat_content: Unable to open mapability file "' - + duke35_filename - + '".' - ) - duke35_exists[0] = False - duke35.extend(["chr_Un 0 1 1"]) - return {self[i]: 1.0 for i in range(len(interval_list))} - - def offsets(self): - if self.offset is not None: - return self.offset - gap = 0.1 - hratio = 0.8 - - vlist = [i for i in self if chrNum(i.chrom) >= 100 and i.chrom[:3] != "chr"] - hlist = [i for i in self if chrNum(i.chrom) < 100 or i.chrom[:3] == "chr"] - v_count = len([i for i in self if chrNum(i.chrom) >= 100 and i.chrom[:3] != "chr"]) - h_count = len(self) - v_count - h_sum = sum([i.size() for i in hlist]) - v_sum = sum([i.size() for i in vlist]) - - hK = len([i for i in hlist if i.size() < h_sum * gap / max(1, h_count)]) - hS = sum([i.size() for i in hlist if i.size > h_sum * gap / max(1, h_count)]) - min_hsize = hS / (max(1, h_count) / gap - hK) - h_sum = hS + hK * min_hsize - - vK = len([i for i in vlist if i.size() < v_sum * gap / max(1, v_count)]) - vS = sum([i.size() for i in vlist if i.size > v_sum * gap / max(1, v_count)]) - min_vsize = vS / (max(1, v_count) / gap - vK) - v_sum = vS + vK * min_vsize - - offset = {} - - h_start = 0 - hscale = 1 if v_count == 0 else hratio - v_start = 0 if h_count == 0 else hratio - vscale = 1 if h_count == 0 else (1 - hratio) - - hgap = gap / h_count if h_count > 0 else 0 - vgap = gap / v_count if v_count > 0 else 0 - hpos = h_start + (hgap / 2) * hscale - vpos = v_start + (vgap / 2) * vscale - for i in hlist: - isize = max(i.size(), min_hsize) - offset[i] = (hpos, hpos + ((1 - gap) * isize / h_sum) * hscale) - hpos = hpos + ((1 - gap) * isize / h_sum + hgap) * hscale - for i in vlist: - isize = max(i.size(), min_vsize) - offset[i] = (vpos, vpos + ((1 - gap) * isize / v_sum) * vscale) - vpos = vpos + ((1 - gap) * isize / v_sum + vgap) * vscale - self.offset = offset - # for i in self: - # print str(i), offset[i], i.size(), hgap, h_sum, hscale, gap, hpos, vpos - # exit() - return offset - - def xpos(self, chrom, pos): - offset = self.offsets() - for i in self: - if i.intersects(interval(chrom, max(0, pos - 1), pos)): - o = offset[i] - return (o[1] * (pos - i.start) + o[0] * (i.end - pos)) / (i.end - i.start) - return None - - def offset_breaks(self): - offset = self.offsets() - gap = 0.1 - hratio = 0.8 - - vlist = [i for i in self if chrNum(i.chrom) >= 100 and i.chrom[:3] != "chr"] - hlist = [i for i in self 
if chrNum(i.chrom) < 100 or i.chrom[:3] == "chr"] - v_count = len([i for i in self if chrNum(i.chrom) >= 100 and i.chrom[:3] != "chr"]) - h_count = len(self) - v_count - h_sum = sum([i.size() for i in hlist]) - v_sum = sum([i.size() for i in vlist]) - - hscale = 1 if v_count == 0 else hratio - vscale = 1 if h_count == 0 else (1 - hratio) - - hgap = gap / h_count if h_count > 0 else 0 - vgap = gap / v_count if v_count > 0 else 0 - - breaks = [] - iprev = None - for i in self: - if iprev is None: - iprev = i - continue - if i in hlist and iprev.chrom == i.chrom: - breaks.append((offset[i][0] - hscale * hgap / 2, ":", i.chrom)) - print(str(i), str(iprev), i in hlist, iprev.chrom == i.chrom) - elif i in hlist and iprev.chrom != i.chrom: - breaks.append((offset[i][0] - hscale * hgap / 2, "--", i.chrom)) - elif i in vlist and iprev in hlist: - breaks.append((offset[i][0] - vscale * vgap / 2, "-", i.chrom)) - elif i in vlist and i.chrom == iprev.chrom: - breaks.append((offset[i][0] - vscale * vgap / 2, ":", i.chrom)) - else: - breaks.append((offset[i][0] - vscale * vgap / 2, "--", i.chrom)) - - iprev = i - return breaks - - def __str__(self): - return str(([str(i) for i in self])) - - -oncogene_list = interval_list(oncogene_filename, "gff") -oncogene_list.sort() -gene_list = interval_list(gene_filename, "gff") - - -exon_list = interval_list([]) - - -def load_exons(): - if len(exon_list) > 0: - return - try: - exon_file = open(exon_filename) - exonFields = [ - interval(j, file_format="gff") - for j in exon_file.read().strip().split("\n") - if ( - len(j.strip()) > 0 - and j.strip()[0] != "#" - and {r.split("=")[0]: r.split("=")[1] for r in j.strip().split()[8].strip(";").split(";")}["color"] - == "000080" - ) - ] - exon_file.close() - exon_list.extend((exonFields)) - except: - logging.warning("#TIME " + "%.3f\t" % clock() + 'unable to load exon file: "' + exon_filename + '"') - - -conserved_regions = interval_list(conserved_regions_filename, "bed") -conserved_regions.sort() - -wgexclude = interval_list(wgexclude_filename, "bed") -wgexclude.sort() - -centromere_list = interval_list(centromere_filename, "bed") -centromere_list.sort() -centromere_list = interval_list([i[0] for i in centromere_list.merge_clusters(extend=1)]) - - -segdup_list = interval_list(segdup_filename, "bed") -segdup_list.sort() diff --git a/bin/mosek_solver.py b/bin/mosek_solver.py deleted file mode 100644 index 36bea415..00000000 --- a/bin/mosek_solver.py +++ /dev/null @@ -1,259 +0,0 @@ -#!/usr/bin/env python - -# Author: Jens Luebeck -# Contact: jluebeck [at] ucsd.edu -# License: BSD 2-Clause License -# Source: https://github.com/AmpliconSuite/AmpliconSuite-pipeline -# Commit: 0a8a2ff2324b15aab7cb88d310dcc458d06c0bed - -# Interface to MOSEK for AmpliconArchitect for Python3 -# -# Supports all versions of MOSEK >= 8 -# -import logging -import sys - -import mosek - -# Check Mosek version -mosek_ver = mosek.Env.getversion() -logging.info("Mosek version is {}".format(".".join([str(x) for x in mosek_ver]))) -mosek_major = mosek_ver[0] - -if sys.version_info < (3, 0) and mosek_major >= 10: - logging.warning("Mosek version is " + ".".join([str(x) for x in mosek_ver]) + " which requires python3. 
Exiting.\n") - sys.exit(1) - - -# MOSEK logging -mosek_logger = logging.getLogger("MOSEK") - - -def moseklogfunc(msg): - mosek_logger.debug(msg.rstrip()) - - -class fusionlogger: - def write(self, msg): - moseklogfunc(msg) - - def flush(self): - pass - - -# Calls MOSEK to solve one instance of the problem -def call_mosek(n, m, asub, aval, coeff_c, coeff_f, coeff_g, const_h): - mosek_logger.info("Beginning MOSEK call") - - ## Enable this line to ALWAYS save all Mosek inputs - # save_mosek_input(n, m, asub, aval, coeff_c, coeff_f, coeff_g, const_h) - - try: - # Determine which MOSEK routing to call - if mosek_major == 8: - return call_mosek_scopt(n, m, asub, aval, coeff_c, coeff_f, coeff_g, const_h) - elif mosek_major == 9: - return call_mosek_fusion(n, m, asub, aval, coeff_c, coeff_f) - elif mosek_major >= 10: - return call_mosek_acc(n, m, asub, aval, coeff_c, coeff_f) - else: - raise Exception("Unsupported MOSEK version {}".format(mosek_major)) - except Exception as e: - # If an error occurred in the MOSEK call then save - # all input data to a JSON file so they can be loaded - # to recreate the MOSEK problem in a stand-alone way. - mosek_logger.error("Error when using MOSEK: {}".format(e)) - print("Error when using MOSEK: {}".format(e)) - filename = save_mosek_input(n, m, asub, aval, coeff_c, coeff_f, coeff_g, const_h) - mosek_logger.info( - "Saved MOSEK inputs to {}. Submit that file to support to reproduce the issue.".format(filename) - ) - raise e - - -""" -This method works with MOSEK == 8. - -Solves the problem - -minimize c^T * x - sum_i(f_i * log(g_i * x_i + h_i)) -subject to A * x == 0 - x >= 0 -""" - - -def call_mosek_scopt(n, m, asub, aval, coeff_c, coeff_f, coeff_g, const_h): - with mosek.Env() as env: - with env.Task() as task: - task.set_Stream(mosek.streamtype.log, moseklogfunc) - - numvar = n + m - numcon = 2 * n - - task.appendcons(numcon) - task.appendvars(numvar) - - task.putvarboundslice(0, numvar, [mosek.boundkey.lo] * numvar, [0.0] * numvar, [0.0] * numvar) - task.putconboundslice(0, numcon, [mosek.boundkey.fx] * numcon, [0.0] * numcon, [0.0] * numcon) - - for i in range(numcon): - task.putarow(i, asub[i], aval[i]) - - task.putclist(range(numvar), coeff_c) - - task.putobjsense(mosek.objsense.minimize) - - task.putSCeval([mosek.scopr.log] * (n + m), range(n + m), coeff_f, coeff_g, const_h) - - task.optimize() - task.solutionsummary(mosek.streamtype.log) - - if task.getsolsta(mosek.soltype.itr) != mosek.solsta.optimal: - raise Exception( - "Failed to solve to optimality. Solution status {}".format(task.getsolsta(mosek.soltype.itr)) - ) - - res = [0.0] * numvar - task.getsolutionslice(mosek.soltype.itr, mosek.solitem.xx, 0, numvar, res) - - return res - - -""" -This method works with MOSEK >= 10. - -Solves the problem - -minimize c^T * x - sum_i(f_i * log(x_i)) -subject to A * x == 0 - -Comments, compared to MOSEK 8 model: - -We ignore the normalizing coefficient h_i from log(g_i * x_i + h_i) and consider only log(g_i * x_i). -Subject to that change we can also skip g_i since it only changes the constant term in the objective. -The condition x>=0 is implicit by x appearing in the logarithm. 
-""" - - -def call_mosek_acc(n, m, asub, aval, coeff_c, coeff_f): - with mosek.Task() as task: - task.set_Stream(mosek.streamtype.log, moseklogfunc) - - task.appendvars(2 * (n + m)) - task.appendcons(2 * n) - task.putvarboundsliceconst(0, 2 * (n + m), mosek.boundkey.fr, 0, 0) - - for i in range(2 * n): - task.putarow(i, asub[i], aval[i]) - - task.putconboundsliceconst(0, 2 * n, mosek.boundkey.fx, 0, 0) - - task.appendafes(2 * (n + m) + 1) - task.putafefentrylist(range(0, 2 * (n + m)), range(0, 2 * (n + m)), [1.0] * (2 * (n + m))) - task.putafeg(2 * (n + m), 1.0) - - expdom = task.appendprimalexpconedomain() - task.appendaccs([expdom] * (n + m), sum([[i, 2 * (n + m), i + n + m] for i in range(n + m)], []), None) - - task.putclist(range(0, n + m), coeff_c) - task.putclist(range(n + m, 2 * (n + m)), coeff_f) - - task.putobjsense(mosek.objsense.minimize) - - task.optimize() - task.solutionsummary(mosek.streamtype.log) - - if task.getsolsta(mosek.soltype.itr) != mosek.solsta.optimal: - raise Exception( - "Failed to solve to optimality. Solution status {}".format(task.getsolsta(mosek.soltype.itr)) - ) - - return task.getxxslice(mosek.soltype.itr, 0, n + m) - - -""" -This method works with MOSEK >= 9. - -Solves the problem - -minimize c^T * x - sum_i(f_i * log(x_i)) -subject to A * x == 0 - -Comments, compared to MOSEK 10 model: - -A simple model in the higher level MOSEK Fusion. Anyhow, we do not expect MOSEK 9 users, really. -Either stay with MOSEK 8 or otherwise there is no reason not to upgrade all the way to MOSEK 10. - -This model can be used in MOSEK >= 9, but it invokes the additional Fusion modeling layer, -which the model from call_mosek_acc skips. If it behaves well though, we could make -it the default. It should be fast enough, and is more readable. -""" - - -def call_mosek_fusion(n, m, asub, aval, coeff_c, coeff_f): - from mosek.fusion import Model, Domain, Expr, Matrix, ObjectiveSense, SolutionStatus - - with Model() as M: - M.setLogHandler(fusionlogger()) - - x = M.variable(n + m) - t = M.variable(n + m) - - for i in range(2 * n): - M.constraint(Expr.dot(aval[i], x.pick(asub[i])), Domain.equalsTo(0)) - - M.constraint(Expr.hstack(x, Expr.constTerm(n + m, 1.0), t), Domain.inPExpCone()) - - M.objective(ObjectiveSense.Minimize, Expr.add(Expr.dot(coeff_c, x), Expr.dot(coeff_f, t))) - - M.solve() - - if M.getPrimalSolutionStatus() != SolutionStatus.Optimal: - raise Exception("Failed to solve to optimality. Solution status {}".format(M.getPrimalSolutionStatus())) - - return x.level() - - -# Debug functions. Dumping input data. -mosek_save_num = 1 - - -def save_mosek_input(n, m, asub, aval, coeff_c, coeff_f, coeff_g, const_h): - import json - - global mosek_save_num - filename = "mosekinput-{}.json".format(mosek_save_num) - data = { - "n": n, - "m": m, - "asub": asub, - "aval": aval, - "coeff_c": coeff_c, - "coeff_f": coeff_f, - "coeff_g": coeff_g, - "const_h": const_h, - } - - with open(filename, "w") as f: - json.dump(data, f) - - mosek_save_num += 1 - return filename - - -# Debug functions. Loading input data. -def load_mosek_input(filename): - import json - - with open(filename, "r") as f: - data = json.load(f) - return ( - data["n"], - data["m"], - data["asub"], - data["aval"], - data["coeff_c"], - data["coeff_f"], - data["coeff_g"], - data["const_h"], - ) diff --git a/bin/mycolors.py b/bin/mycolors.py deleted file mode 100755 index 56994e2e..00000000 --- a/bin/mycolors.py +++ /dev/null @@ -1,136 +0,0 @@ -# This software is Copyright 2017 The Regents of the University of California. 
All Rights Reserved. Permission to copy, modify, and distribute this software and its documentation for educational, research and non-profit purposes, without fee, and without a written agreement is hereby granted, provided that the above copyright notice, this paragraph and the following three paragraphs appear in all copies. Permission to make commercial use of this software may be obtained by contacting: -# -# Office of Innovation and Commercialization -# -# University of California -# -# La Jolla, CA 92093-0910 -# -# (858) 534-5815 -# -# invent@ucsd.edu -# -# This software program and documentation are copyrighted by The Regents of the University of California. The software program and documentation are supplied "as is", without any accompanying services from The Regents. The Regents does not warrant that the operation of the program will be uninterrupted or error-free. The end-user understands that the program was developed for research purposes and is advised not to rely exclusively on the program for any reason. -# -# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. - -# Author: Viraj Deshpande -# Contact: virajbdeshpande@gmail.com -# Maintained by Jens Luebeck, jluebeck@ucsd.edu -# Source: https://github.com/AmpliconSuite/AmpliconSuite-pipeline -# Commit: 0a8a2ff2324b15aab7cb88d310dcc458d06c0bed - - -chrcolor = { - "b": "b", - "g": "g", - "r": "r", - "c": "c", - "m": "m", - "y": "y", - "k": "k", - "w": "w", - "chr1": (153 / 256.0, 102 / 256.0, 0 / 256.0), - "chr2": (102 / 256.0, 102 / 256.0, 0 / 256.0), - "chr3": (153 / 256.0, 153 / 256.0, 30 / 256.0), - "chr4": (204 / 256.0, 0 / 256.0, 0 / 256.0), - "chr5": (255 / 256.0, 0 / 256.0, 0 / 256.0), - "chr6": (255 / 256.0, 0 / 256.0, 204 / 256.0), - "chr7": (255 / 256.0, 204 / 256.0, 204 / 256.0), - "chr8": (255 / 256.0, 153 / 256.0, 0 / 256.0), - "chr9": (255 / 256.0, 204 / 256.0, 0 / 256.0), - "chr10": (255 / 256.0, 255 / 256.0, 0 / 256.0), - "chr11": (204 / 256.0, 255 / 256.0, 0 / 256.0), - "chr12": (0 / 256.0, 255 / 256.0, 0 / 256.0), - "chr13": (53 / 256.0, 128 / 256.0, 0 / 256.0), - "chr14": (0 / 256.0, 0 / 256.0, 204 / 256.0), - "chr15": (102 / 256.0, 153 / 256.0, 255 / 256.0), - "chr16": (153 / 256.0, 204 / 256.0, 255 / 256.0), - "chr17": (0 / 256.0, 255 / 256.0, 255 / 256.0), - "chr18": (204 / 256.0, 255 / 256.0, 255 / 256.0), - "chr19": (153 / 256.0, 0 / 256.0, 204 / 256.0), - "chr20": (204 / 256.0, 51 / 256.0, 255 / 256.0), - "chr21": (204 / 256.0, 153 / 256.0, 255 / 256.0), - "chr22": (102 / 256.0, 102 / 256.0, 102 / 256.0), - "chr23": (153 / 256.0, 153 / 256.0, 153 / 256.0), - "chrX": (153 / 256.0, 153 / 256.0, 153 / 256.0), - "chr24": (204 / 256.0, 204 / 256.0, 204 / 256.0), - "chrY": (204 / 256.0, 204 / 256.0, 204 / 256.0), - "chrM": (204 / 256.0, 204 / 256.0, 153 / 256.0), - "chr0": (204 / 256.0, 204 / 256.0, 153 / 256.0), - "chrUn": (121 / 256.0, 204 / 256.0, 61 / 256.0), - "chrNA": (255 / 256.0, 255 / 
256.0, 255 / 256.0), - "lum90chr1": (255 / 256.0, 216 / 256.0, 156 / 256.0), - "lum90chr2": (230 / 256.0, 230 / 256.0, 165 / 256.0), - "lum90chr3": (232 / 256.0, 232 / 256.0, 135 / 256.0), - "lum90chr4": (255 / 256.0, 166 / 256.0, 166 / 256.0), - "lum90chr5": (255 / 256.0, 147 / 256.0, 147 / 256.0), - "lum90chr6": (255 / 256.0, 152 / 256.0, 255 / 256.0), - "lum90chr7": (255 / 256.0, 214 / 256.0, 214 / 256.0), - "lum90chr8": (255 / 256.0, 202 / 256.0, 102 / 256.0), - "lum90chr9": (255 / 256.0, 220 / 256.0, 58 / 256.0), - "lum90chr10": (234 / 256.0, 234 / 256.0, 0 / 256.0), - "lum90chr11": (194 / 256.0, 245 / 256.0, 0 / 256.0), - "lum90chr12": (34 / 256.0, 255 / 256.0, 34 / 256.0), - "lum90chr13": (174 / 256.0, 244 / 256.0, 155 / 256.0), - "lum90chr14": (215 / 256.0, 215 / 256.0, 255 / 256.0), - "lum90chr15": (182 / 256.0, 224 / 256.0, 255 / 256.0), - "lum90chr16": (182 / 256.0, 231 / 256.0, 255 / 256.0), - "lum90chr17": (0 / 256.0, 252 / 256.0, 252 / 256.0), - "lum90chr18": (185 / 256.0, 236 / 256.0, 236 / 256.0), - "lum90chr19": (255 / 256.0, 191 / 256.0, 255 / 256.0), - "lum90chr20": (255 / 256.0, 177 / 256.0, 255 / 256.0), - "lum90chr21": (255 / 256.0, 206 / 256.0, 255 / 256.0), - "lum90chr22": (198 / 256.0, 198 / 256.0, 198 / 256.0), - "lum90chr23": (153 / 256.0, 153 / 256.0, 153 / 256.0), - "lum90chrX": (153 / 256.0, 153 / 256.0, 153 / 256.0), - "lum90chr24": (204 / 256.0, 204 / 256.0, 204 / 256.0), - "lum90chrY": (204 / 256.0, 204 / 256.0, 204 / 256.0), - "lum90chrM": (174 / 256.0, 174 / 256.0, 122 / 256.0), - "lum90chr0": (174 / 256.0, 174 / 256.0, 122 / 256.0), - "lum90chrUn": (108 / 256.0, 191 / 256.0, 38 / 256.0), - "lum90chrNA": (171 / 256.0, 171 / 256.0, 171 / 256.0), - "lum80chr1": (244 / 256.0, 188 / 256.0, 127 / 256.0), - "lum80chr2": (202 / 256.0, 202 / 256.0, 136 / 256.0), - "lum80chr3": (203 / 256.0, 203 / 256.0, 103 / 256.0), - "lum80chr4": (255 / 256.0, 137 / 256.0, 137 / 256.0), - "lum80chr5": (255 / 256.0, 116 / 256.0, 116 / 256.0), - "lum80chr6": (255 / 256.0, 119 / 256.0, 255 / 256.0), - "lum80chr7": (237 / 256.0, 186 / 256.0, 186 / 256.0), - "lum80chr8": (255 / 256.0, 174 / 256.0, 62 / 256.0), - "lum80chr9": (243 / 256.0, 192 / 256.0, 0 / 256.0), - "lum80chr10": (206 / 256.0, 206 / 256.0, 0 / 256.0), - "lum80chr11": (166 / 256.0, 216 / 256.0, 0 / 256.0), - "lum80chr12": (0 / 256.0, 232 / 256.0, 0 / 256.0), - "lum80chr13": (146 / 256.0, 216 / 256.0, 126 / 256.0), - "lum80chr14": (186 / 256.0, 186 / 256.0, 255 / 256.0), - "lum80chr15": (152 / 256.0, 196 / 256.0, 255 / 256.0), - "lum80chr16": (152 / 256.0, 203 / 256.0, 254 / 256.0), - "lum80chr17": (0 / 256.0, 224 / 256.0, 224 / 256.0), - "lum80chr18": (156 / 256.0, 208 / 256.0, 208 / 256.0), - "lum80chr19": (250 / 256.0, 161 / 256.0, 255 / 256.0), - "lum80chr20": (255 / 256.0, 146 / 256.0, 255 / 256.0), - "lum80chr21": (227 / 256.0, 177 / 256.0, 255 / 256.0), - "lum80chr22": (198 / 256.0, 198 / 256.0, 198 / 256.0), - "lum80chr23": (153 / 256.0, 153 / 256.0, 153 / 256.0), - "lum80chrX": (153 / 256.0, 153 / 256.0, 153 / 256.0), - "lum80chr24": (204 / 256.0, 204 / 256.0, 204 / 256.0), - "lum80chrY": (204 / 256.0, 204 / 256.0, 204 / 256.0), - "lum80chrM": (174 / 256.0, 174 / 256.0, 122 / 256.0), - "lum80chr0": (174 / 256.0, 174 / 256.0, 122 / 256.0), - "lum80chrUn": (108 / 256.0, 191 / 256.0, 38 / 256.0), - "lum80chrNA": (171 / 256.0, 171 / 256.0, 171 / 256.0), - "vlpurple": (218 / 256.0, 218 / 256.0, 235 / 256.0), - "vlorange": (253 / 256.0, 208 / 256.0, 162 / 256.0), - "vlpgreen": (218 / 256.0, 218 / 256.0, 235 / 
256.0), -} - - -ecolor = { - "interchromosomal": "blue", - "concordant": "black", - "everted": (139 / 256.0, 69 / 256.0, 19 / 256.0), # 'brown', yellow', - "forward": "magenta", - "reverse": (0 / 256.0, 139 / 256.0, 139 / 256.0), #'cyan', - "discordant": "red", -} diff --git a/bin/realigner.py b/bin/realigner.py old mode 100644 new mode 100755 diff --git a/bin/ref_util.py b/bin/ref_util.py deleted file mode 100755 index 1ebc1cd5..00000000 --- a/bin/ref_util.py +++ /dev/null @@ -1,832 +0,0 @@ -# This software is Copyright 2017 The Regents of the University of California. All Rights Reserved. Permission to copy, modify, and distribute this software and its documentation for educational, research and non-profit purposes, without fee, and without a written agreement is hereby granted, provided that the above copyright notice, this paragraph and the following three paragraphs appear in all copies. Permission to make commercial use of this software may be obtained by contacting: -# -# Office of Innovation and Commercialization -# -# University of California -# -# La Jolla, CA 92093-0910 -# -# (858) 534-5815 -# -# invent@ucsd.edu -# -# This software program and documentation are copyrighted by The Regents of the University of California. The software program and documentation are supplied "as is", without any accompanying services from The Regents. The Regents does not warrant that the operation of the program will be uninterrupted or error-free. The end-user understands that the program was developed for research purposes and is advised not to rely exclusively on the program for any reason. -# -# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. - -# Author: Viraj Deshpande -# Contact: virajbdeshpande@gmail.com -# Maintained by Jens Luebeck, jluebeck@ucsd.edu -# Source: https://github.com/AmpliconSuite/AmpliconSuite-pipeline -# Commit: 0a8a2ff2324b15aab7cb88d310dcc458d06c0bed - - -##This is a suite to load reference genome (not just hg19, as filename implies), genes, exons, repeat content and perform operations on this genome, compare variants -## it handles annotations from a database and is not restricted to solely hg19 if global_names.REF is not hg19. - -import sys -from bisect import bisect_left -from collections import defaultdict -from time import time -import pysam -import heapq -import copy -import os -import logging - -import global_names - -REF = global_names.REF -TSTART = global_names.TSTART -print("Global ref name is " + REF) - -try: - DATA_REPO = os.environ["AA_DATA_REPO"] -except: - logging.warning( - "#TIME " + "%.3f\t" % (time() - TSTART) + " Unable to set AA_DATA_REPO variable. Setting to working directory" - ) - DATA_REPO = "." -if DATA_REPO == "." or DATA_REPO == "": - logging.warning( - "#TIME " + "%.3f\t" % (time() - TSTART) + " AA_DATA_REPO not set or empy. Setting to working directory" - ) - DATA_REPO = "." 
- -REF_files = defaultdict(lambda: "", {}) -try: - for l in open(DATA_REPO + "/" + REF + "/file_list.txt"): - REF_files[l.strip().split()[0]] = l.strip().split()[1] -except: - logging.warning( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + " Unable to find reference in $AA_DATA_REPO/REF/file_list.txt. Setting to empty." - ) - - -class fake_fasta(object): - def fetch(self, a=None, b=0, c=0): - return "".join(["N" for i in range(c - b + 1)]) - - -try: - fa_file = pysam.Fastafile(DATA_REPO + "/" + REF + "/" + REF_files["fa_file"]) -except: - logging.warning( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + ' Unable to open fasta file: "' - + DATA_REPO - + "/" - + REF - + "/" - + REF_files["fa_file"] - + '". Reference sequences will be set to N.' - ) - fa_file = fake_fasta() - -chrLen_filename = DATA_REPO + "/" + REF + "/" + REF_files["chrLen_file"] -duke35_filename = DATA_REPO + "/" + REF + "/" + REF_files["duke35_filename"] -wgexclude_filename = DATA_REPO + "/" + REF + "/" + REF_files["mapability_exclude_filename"] -gene_filename = DATA_REPO + "/" + REF + "/" + REF_files["gene_filename"] -exon_filename = DATA_REPO + "/" + REF + "/" + REF_files["exon_file"] -oncogene_filename = DATA_REPO + "/" + REF + "/" + REF_files["oncogene_filename"] -centromere_filename = DATA_REPO + "/" + REF + "/" + REF_files["centromere_filename"] -conserved_regions_filename = DATA_REPO + "/" + REF + "/" + REF_files["conserved_regions_filename"] -segdup_filename = DATA_REPO + "/" + REF + "/" + REF_files["segdup_filename"] -complementary_nucleotide = defaultdict( - lambda: "N", {"A": "T", "C": "G", "G": "C", "T": "A", "a": "t", "c": "g", "g": "c", "t": "a", "n": "n", "N": "N"} -) -duke35 = [] -duke35_exists = [True] - -# Handling chromosome names, lengths, sorting, positions and addition of new chromosomes -chr_id = {} -chrName = {} -chromList = [str(x) for x in range(1, 23)] + ["X" + "Y"] # must be updated if including an organism with more chroms. 
- - -def chrNum(chrname, mode="append"): - if chrname in chr_id: - return chr_id[chrname] - else: - if mode == "init": - cnum = len(chr_id) - else: - cnum = 1000000 + len(chr_id) - chr_id[chrname] = cnum - chrName[cnum] = chrname - return chr_id[chrname] - - -chrLen = defaultdict(lambda: 0, {}) -try: - for line in open(chrLen_filename): - ll = line.strip().split() - chrLen[chrNum(ll[0], mode="init")] = int(ll[1]) -except: - logging.warning( - "#TIME " + "%.3f\t" % (time() - TSTART) + ' Unable to open chromosome lengths file: "' + chrLen_filename + '"' - ) - -chrOffset = {} - - -def absPos(chrname, pos=0): - cnum = chrNum(chrname) - if chrNum(chrname) not in chrOffset: - chrkeys = sorted(chrName.keys()) - sumlen = sum([chrLen[c] for c in chrLen if c in chrOffset]) - for i in range(len(chrkeys)): - if chrkeys[i] not in chrOffset: - chrOffset[chrkeys[i]] = sumlen - sumlen += chrLen[chrkeys[i]] - if cnum < chrkeys[i]: - break - return chrOffset[chrNum(chrname)] + pos - - -for c in chrLen: - ap = absPos(chrName[c]) - - -def chrPos(abspos): - for c in chrOffset: - if chrOffset[c] < abspos and chrOffset[c] + chrLen[c] >= abspos: - return (chrName[c], abspos - chrOffset[c]) - return None - - -def update_chrLen(len_list): - for l in len_list: - chrLen[chrNum(l[0])] = int(l[1]) - for l in len_list: - cpos = absPos(l[0], 1) - - -def reverse_complement(seq): - return "".join([complementary_nucleotide[a] for a in seq][::-1]) - - -class interval(object): - def __init__( - self, line, start=-1, end=-1, strand=1, file_format="", bamfile=None, info="", exclude_info_string=False - ): - self.info = "" - self.file_format = file_format - if type(line) == pysam.AlignedRead or type(line) == pysam.AlignedSegment: - self.load_pysamread(line, bamfile) - elif start == -1: - self.load_line(line, file_format, exclude_info_string=exclude_info_string) - elif end == -1: - self.load_pos(line, start, start, strand) - else: - self.load_pos(line, start, end, strand) - if len(info) > 0: - self.info = info - - def load_line(self, line, file_format, exclude_info_string=False): - if file_format == "": - if len(line.strip().split()) == 1: - self.chrom = line.split(":")[0] - self.start = int(line.split(":")[1].split("-")[0]) - if "-" not in line: - self.end = int(line.split(":")[1].split("-")[0]) - else: - self.end = int(line.split(":")[1].split("-")[1]) - if self.start < self.end: - self.strand = 1 - else: - self.strand = -1 - return - else: - file_format = "bed" - if file_format == "gff": - ll = line.strip().split() - self.chrom = ll[0] - self.start, self.end = sorted([int(float(ll[3])), int(float(ll[4]))]) - if ll[6] == "+": - self.strand = 1 - else: - self.strand = -1 - if not exclude_info_string: - self.info = {r[0 : r.find("=")]: r[r.find("=") + 1 :] for r in ll[8].strip().strip(";").split(";")} - self.info["Variant"] = ll[5] - elif file_format == "bed": - ll = line.strip().split() - self.chrom = ll[0] - if (REF == "hg19" or REF == "GRCh38" or REF == "mm10" or REF == "GRCm38") and 0 < len(self.chrom) < 3: - try: - ci = int(self.chrom) - if 0 < ci < 23: - self.chrom = "chr" + self.chrom - logging.info("Corrected chromosome name (appended 'chr') " + self.chrom + " \n") - - except ValueError: - if self.chrom in {"M", "X", "Y"}: - self.chrom = "chr" + self.chrom - else: - logging.warning("Chromosome name " + self.chrom + " may be incompatible") - - self.start, self.end = sorted([int(float(ll[1])), int(float(ll[2]))]) - if int(float(ll[2])) >= int(float(ll[1])): - self.strand = 1 - else: - self.strand = -1 - if not 
exclude_info_string: - self.info = ll[3:] - else: - raise (Exception("Invalid interval format" + str(line))) - - def load_pos(self, chrom, start, end, strand): - self.chrom = chrom - self.start = int(start) - self.end = int(end) - self.strand = strand - if start > end: - self.start = int(end) - self.end = int(start) - self.strand = -1 * strand - - def load_pysamread(self, line, bamfile): - if bamfile is None: - raise Exception("Interval of pysam AlignedRead without bamfile") - self.chrom = line.reference_name - self.start = line.reference_start - self.end = 0 - if line.reference_end is not None: - self.end = line.reference_end - else: - logging.warning("Reference_end for " + str(self) + " was NoneType. Setting to 0.") - - if line.is_reverse: - self.strand = -1 - else: - self.strand = 1 - - def __gt__(self, y): - if self.chrom != y.chrom: - return chrNum(self.chrom) > chrNum(y.chrom) - elif int(self.end) != int(y.end): - return int(self.end) > int(y.end) - else: - return int(self.start) > int(y.start) - - def size(self): - return self.end - self.start + 1 - - def __str__(self): - if len(str(self.info)) == 0: - return "\t".join(map(str, [self.chrom, self.start, self.end])) - elif type(self.info) == list: - return "\t".join(map(str, [self.chrom, self.start, self.end] + list(self.info))) - elif type(self.info) == dict: - return "\t".join( - map(str, [self.chrom, self.start, self.end] + [str(s) + "=" + str(self.info[s]) for s in self.info]) - ) - else: - return "\t".join(map(str, [self.chrom, self.start, self.end, self.info])) - - def gc_content(self): - seq = fa_file.fetch(self.chrom, self.start, self.end) - # if 'G' in seq: - # print seq, seq.count('G'), seq.count('C'), float(seq.count('G') + seq.count('C')) / len(seq) - # exit() - if len(seq) == 0: - return 0.5 - return float(seq.count("G") + seq.count("C") + seq.count("g") + seq.count("c")) / len(seq) - - def sequence(self, new_fa_file=None): - if new_fa_file is not None: - seq = new_fa_file.fetch(self.chrom, self.start, self.end + 1) - else: - seq = fa_file.fetch(self.chrom, self.start, self.end + 1) - if self.strand == 1: - return seq - else: - return "".join([complementary_nucleotide[a] for a in seq][::-1]) - - def intersects(self, n, extend=0, margin=0.0): - if margin > 0.0: - if self.intersects( - interval(n.chrom, n.start, n.end - (1 - margin) * (n.end - n.start)) - ) and self.intersects(interval(n.chrom, n.start + (1 - margin) * (n.end - n.start)), n.end): - return True - else: - s = self - if n.intersects(interval(s.chrom, s.start, s.end - (1 - margin) * (s.end - s.start))) and n.intersects( - interval(s.chrom, s.start + (1 - margin) * (s.end - s.start)), s.end - ): - return True - return False - a = [self.chrom, max(0, self.start - extend), self.end + extend] - b = [n.chrom, n.start, n.end] - if a[0] != b[0]: - return False - if (int(a[1]) - int(b[1])) * (int(a[2]) - int(b[1])) <= 0: - return True - if (int(a[1]) - int(b[2])) * (int(a[2]) - int(b[2])) <= 0: - return True - if (int(a[1]) - int(b[1])) * (int(a[1]) - int(b[2])) <= 0: - return True - if (int(a[2]) - int(b[1])) * (int(a[2]) - int(b[2])) <= 0: - return True - return False - - def intersection(self, y): - if not self.intersects(y): - return None - return interval(self.chrom, max(self.start, y.start), min(self.end, y.end)) - - def merge(self, y, extend=0): - if not self.intersects(y, extend): - return None - return interval(self.chrom, min(self.start, y.start), max(self.end, y.end)) - - def atomize(self, y): - il = interval_list([self, y]) - il.sort() - ilr = [] - if 
il[0].intersects(il[1]): - ilint = il[0].intersection(il[1]) - if il[0].start < il[1].start: - ilr.append((interval(il[0].chrom, il[0].start, ilint.start - 1), [il[0]])) - elif il[1].start < il[0].start: - ilr.append((interval(il[1].chrom, il[1].start, ilint.start - 1), [il[1]])) - ilr.append((ilint, il)) - if il[0].end > il[1].end: - ilr.append((interval(il[0].chrom, ilint.end + 1, il[0].end), [il[0]])) - elif il[1].end > il[0].end: - ilr.append((interval(il[1].chrom, ilint.end + 1, il[1].end), [il[1]])) - return ilr - else: - return [(il[0], [il[0]]), (il[1], [il[1]])] - - def contains(self, x, y=-1, z=-1): - if type(x) == interval: - if self.intersects(x) and self.intersection(x).size() == x.size(): - return True - else: - return False - if y != -1: - if z == -1: - z = y - if ( - self.intersects(interval(x, y, z)) - and self.intersection(interval(x, y, z)).size() == interval(x, y, z).size() - ): - return True - return False - - def filter_repeat(self): - if len(interval_list([self]).intersection(wgexclude)) > 0: - return True - if len(interval_list([self]).intersection(conserved_regions)) > 0: - return True - if self.rep_content() > 4.5: - return True - return False - - def rep_content(self): - # logging.info("#TIME " + '%.3f\t'%(time() - TSTART) + " rep_content: init ") - if self.chrom == "chrM" or self.chrom == "MT": - return 5.0 - if self.chrom.strip("chr") not in chromList: - return 1.0 - s34 = interval(self.chrom, self.start, max(self.start, self.end - 34)) - # logging.info("#TIME " + '%.3f\t'%(time() - TSTART) + " rep_content: to load duke ") - if duke35_exists[0] and len(duke35) == 0: - try: - duke35file = open(duke35_filename) - duke35.extend([l.strip() for l in duke35file]) - duke35file.close() - except: - logging.warning( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + ' rep_content: Unable to open mapability file "' - + duke35_filename - + '".' 
- ) - duke35_exists[0] = False - duke35.extend(["chr_Un 0 1 1"]) - # logging.info("#TIME " + '%.3f\t'%(time() - TSTART) + " rep_content: duke loaded") - ictime = 0 - itime = 0 - hi = len(duke35) - 1 - lo = 0 - numiter = 0 - while hi - lo > 1: - numiter += 1 - p = (hi + lo) // 2 - ctime = time() - m = interval(duke35[p]) - ictime += time() - ctime - ctime = time() - if s34.intersects(m) or m > s34: - hi = p - else: - lo = p - itime += time() - ctime - p = lo - m = interval(duke35[p]) - sum_duke = 0 - len_duke = 0 - # logging.info("#TIME " + '%.3f\t'%(time() - TSTART) + " rep_content: found " + str(numiter) + " " + str(ictime) + " " + str(itime)) - while s34 > m or s34.intersects(m): - if not s34.intersects(m): - p += 1 - if p >= len(duke35) or p <= 0: - raise Exception( - "p index out of range: " - + str(p) - + " " - + str(lo) - + " " - + str(self) - + " " - + str(m) - + " " - + str(interval(duke35[lo])) - ) - m = interval(duke35[p]) - continue - repc = 5.0 if float(m.info[0]) == 0 else 1.0 / float(m.info[0]) - sum_duke += s34.intersection(m).size() * repc - len_duke += s34.intersection(m).size() - p += 1 - if p >= len(duke35): - break - m = interval(duke35[p]) - # logging.info("#TIME " + '%.3f\t'%(time() - TSTART) + " rep_content: done") - # exit() - if len_duke > 0: - return sum_duke / len_duke - else: - return 1.0 - - def num_unmasked(self): - if self.chrom not in fa_file.references: - return self.size() - seq = fa_file.fetch(self.chrom, self.start, self.end) - return len([c for c in seq if c in "ACGT"]) - - def segdup_uniqueness(self): - sl = interval_list([self]).intersection(segdup_list) - slsd = sum([self.intersection(i[1]).size() for i in sl]) - return float(self.size()) / (self.size() + slsd) - - def extend(self, extend_len=0): - return interval( - self.chrom, - max(0, self.start - extend_len), - min(self.end + extend_len, chrLen[chrNum(self.chrom)]), - self.strand, - ) - - -class interval_list(list, object): - def __init__(self, ilist=None, file_format=None, sort=True, exclude_info_string=False): - if ilist == None: - ilist = [] - self.file_format = file_format - if file_format in ["bed", "gff"]: - self.bed_to_list(ilist, exclude_info_string=exclude_info_string) - if file_format is None: - list.__init__(self, ilist) - if sort: - self.sort() - self.offset = None - - def bed_to_list(self, file_name, exclude_info_string=False): - if file_name is not None: - try: - f = open(file_name) - list.__init__( - self, - [ - interval(l, file_format=self.file_format, exclude_info_string=exclude_info_string) - for l in f - if len(l.strip().split()) > 2 and l.strip()[0] != "#" - ], - ) - f.close() - except: - logging.error( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + ' interval_list: Unable to open interval file "' - + file_name - + '".' 
- ) - - def merge_clusters(self, extend=0, margin=0.0): - ml = [] - ci = None - cl = [] - ai = 0 - cend = len(self) - for a in self[::-1]: - ai += 1 - if ci is None or not a.intersects(ci, extend, margin): - cstart = len(self) - ai + 1 - cl = self[cstart:cend] - if ci is not None: - ml.append((ci, cl)) - ci = a - cl = [] - cend = len(self) - ai + 1 - # if ai != sum([len(m[1]) for m in ml]) + 1: - # print "divergent", ai, str(a) - # exit() - ci = ci.merge(a, extend) - # cl.append(a) - cstart = 0 - cl = self[cstart:cend] - if ci is not None: - ml.append((ci, cl)) - return ml[::-1] - - def repeats(self, count=1): - activeq = [] - if activeq is None: - print("h1") - exit() - jinterval = None - ilist = [] - for a in self[::-1]: - while len(activeq) > 0 and not a.intersects(activeq[0][1]): - heapq.heappop(activeq) - if activeq is None: - print("h2") - exit() - if len(activeq) < count and jinterval is not None: - ilist.append((jinterval, copy.copy(aq))) - if activeq is None: - print("h3") - exit() - jinterval = None - heapq.heappush(activeq, (-1 * a.start, a)) - if len(activeq) >= count: - if jinterval is None: - jinterval = interval(a.chrom, activeq[0][1].start, a.end) - aq = copy.copy(activeq) - else: - jinterval.start = min(jinterval.start, activeq[0][1].start) - heapq.heappush(aq, (-1 * a.start, a)) - if jinterval is not None: - ilist.append((jinterval, copy.copy(aq))) - jinterval = None - return ilist[::-1] - - def intersection(self, l2, extend=0): - si = len(self) - 1 - l2i = len(l2) - 1 - sj = len(self) - 1 - l2j = len(l2) - 1 - il = [] - while si >= 0: - while l2i >= 0 and l2[l2i] > self[si] and not self[si].intersects(l2[l2i], extend=extend): - l2i -= 1 - l2j = l2i - while l2j >= 0 and self[si].intersects(l2[l2j], extend=extend): - il.append((self[si], l2[l2j])) - l2j -= 1 - si -= 1 - return il[::-1] - - def atomize(self, h2): - i = 0 - j = 0 - atomlist = [] - if len(self) > 0: - c1 = self[0] - if len(h2) > 0: - c2 = h2[0] - c = None - while i < len(self) or j < len(h2): - # if c is not None: - # print "%%", i, j, str(c[0]), [str(aa) for aa in c[1]], [str(aa[0]) for aa in atomlist] - # else: - # print "%%", i, j, [], [str(aa[0]) for aa in atomlist] - if c is not None: - if i < len(self) and self[i] not in c[1] and (self[i].intersects(c[0], -1) or c[0] > self[i]): - atm = self[i].atomize(c[0]) - atm = [(aa[0], [(lambda x: c[1][0] if x == c[0] else x)(aai) for aai in aa[1]]) for aa in atm] - # print "%i", [len(rr[1]) for rr in atm], [str(rr[0]) for rr in atm] - c = atm[-1] - i += 1 - atomlist += atm[:-1] - elif j < len(h2) and h2[j] not in c[1] and (h2[j].intersects(c[0], -1) or c[0] > h2[j]): - # print j, str(h2[j]), str(c[0]), c[0] > h2[j] - atm = c[0].atomize(h2[j]) - atm = [(aa[0], [(lambda x: c[1][0] if x == c[0] else x)(aai) for aai in aa[1]]) for aa in atm] - # print "%j", [len(rr[1]) for rr in atm], [str(rr[0]) for rr in atm] - c = atm[-1] - j += 1 - atomlist += atm[:-1] - else: - atomlist.append(c) - # if i < len(self) and self[i] in c[1]: - # i += 1 - # if j < len(h2) and h2[j] in c[1]: - # j += 1 - c = None - else: - if i >= len(self): - atomlist.append((h2[j], [h2[j]])) - j += 1 - elif j >= len(h2): - atomlist.append((self[i], [self[i]])) - i += 1 - else: - atm = self[i].atomize(h2[j]) - atomlist += atm[:-1] - c = atm[-1] - # if self[i] not in c[1]: - i += 1 - # if h2[j] not in c[1]: - j += 1 - if c is not None: - atomlist.append(c) - return atomlist - - def get_repeat_content(self): - try: - duke35_file = open(duke35_filename) - print("counting repeats", time()) - self.sort() 
- sum_duke = [0.0 for i in self] - len_duke = [0.0 for i in self] - lno = 0 - i = 0 - j = 0 - for line in duke35_file: - lno += 1 - duke_int = interval(line) - while not (duke_int.intersects(self[i])) and duke_int > self[i]: - i += 1 - if not duke_int.intersects(self[i]) and self[i] > duke_int: - continue - j = i - repc = 5.0 if float(duke_int.info[0]) == 0 else 1 / float(duke_int.info[0]) - while j < len(self) and self[j].intersects(duke_int): - sum_duke[j] += self[j].intersection(duke_int).size() * repc - len_duke[j] += self[j].intersection(duke_int).size() - j += 1 - duke35_file.close() - return {self[i]: sum_duke[i] / len_duke[i] for i in range(len(interval_list))} - except: - logging.warning( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + ' get_repeat_content: Unable to open mapability file "' - + duke35_filename - + '".' - ) - duke35_exists[0] = False - duke35.extend(["chr_Un 0 1 1"]) - return {self[i]: 1.0 for i in range(len(interval_list))} - - def offsets(self): - if self.offset is not None: - return self.offset - gap = 0.1 - hratio = 0.8 - - vlist = [i for i in self if chrNum(i.chrom) >= 100 and i.chrom[:3] != "chr"] - hlist = [i for i in self if chrNum(i.chrom) < 100 or i.chrom[:3] == "chr"] - v_count = len([i for i in self if chrNum(i.chrom) >= 100 and i.chrom[:3] != "chr"]) - h_count = len(self) - v_count - h_sum = sum([i.size() for i in hlist]) - v_sum = sum([i.size() for i in vlist]) - - hK = len([i for i in hlist if i.size() < h_sum * gap / max(1, h_count)]) - hS = sum([i.size() for i in hlist if i.size() > h_sum * gap / max(1, h_count)]) - min_hsize = hS / (max(1, h_count) / gap - hK) - h_sum = hS + hK * min_hsize - - vK = len([i for i in vlist if i.size() < v_sum * gap / max(1, v_count)]) - vS = sum([i.size() for i in vlist if i.size() > v_sum * gap / max(1, v_count)]) - min_vsize = vS / (max(1, v_count) / gap - vK) - v_sum = vS + vK * min_vsize - - offset = {} - - h_start = 0 - hscale = 1 if v_count == 0 else hratio - v_start = 0 if h_count == 0 else hratio - vscale = 1 if h_count == 0 else (1 - hratio) - - hgap = gap / h_count if h_count > 0 else 0 - vgap = gap / v_count if v_count > 0 else 0 - hpos = h_start + (hgap / 2) * hscale - vpos = v_start + (vgap / 2) * vscale - for i in hlist: - isize = max(i.size(), min_hsize) - offset[i] = (hpos, hpos + ((1 - gap) * isize / h_sum) * hscale) - hpos = hpos + ((1 - gap) * isize / h_sum + hgap) * hscale - for i in vlist: - isize = max(i.size(), min_vsize) - offset[i] = (vpos, vpos + ((1 - gap) * isize / v_sum) * vscale) - vpos = vpos + ((1 - gap) * isize / v_sum + vgap) * vscale - self.offset = offset - # for i in self: - # print str(i), offset[i], i.size(), hgap, h_sum, hscale, gap, hpos, vpos - # exit() - return offset - - def xpos(self, chrom, pos): - offset = self.offsets() - for i in self: - if i.intersects(interval(chrom, max(0, pos - 1), pos)): - o = offset[i] - return (o[1] * (pos - i.start) + o[0] * (i.end - pos)) / (i.end - i.start) - return None - - def offset_breaks(self): - offset = self.offsets() - gap = 0.1 - hratio = 0.8 - - vlist = [i for i in self if chrNum(i.chrom) >= 100 and i.chrom[:3] != "chr"] - hlist = [i for i in self if chrNum(i.chrom) < 100 or i.chrom[:3] == "chr"] - v_count = len([i for i in self if chrNum(i.chrom) >= 100 and i.chrom[:3] != "chr"]) - h_count = len(self) - v_count - h_sum = sum([i.size() for i in hlist]) - v_sum = sum([i.size() for i in vlist]) - - hscale = 1 if v_count == 0 else hratio - vscale = 1 if h_count == 0 else (1 - hratio) - - hgap = gap / h_count if h_count > 0 else 0 - 
vgap = gap / v_count if v_count > 0 else 0 - - breaks = [] - iprev = None - for i in self: - if iprev is None: - iprev = i - continue - if i in hlist and iprev.chrom == i.chrom: - breaks.append((offset[i][0] - hscale * hgap / 2, ":", i.chrom)) - print(str(i), str(iprev), i in hlist, iprev.chrom == i.chrom) - elif i in hlist and iprev.chrom != i.chrom: - breaks.append((offset[i][0] - hscale * hgap / 2, "--", i.chrom)) - elif i in vlist and iprev in hlist: - breaks.append((offset[i][0] - vscale * vgap / 2, "-", i.chrom)) - elif i in vlist and i.chrom == iprev.chrom: - breaks.append((offset[i][0] - vscale * vgap / 2, ":", i.chrom)) - else: - breaks.append((offset[i][0] - vscale * vgap / 2, "--", i.chrom)) - - iprev = i - return breaks - - def __str__(self): - return str(([str(i) for i in self])) - - -oncogene_list = interval_list(oncogene_filename, "gff") -oncogene_list.sort() -gene_list = interval_list(gene_filename, "gff") - - -exon_list = interval_list([]) - - -def load_exons(): - if len(exon_list) > 0: - return - try: - exon_file = open(exon_filename) - exonFields = [ - interval(j, file_format="gff") - for j in exon_file.read().strip().split("\n") - if ( - len(j.strip()) > 0 - and j.strip()[0] != "#" - and {r.split("=")[0]: r.split("=")[1] for r in j.strip().split()[8].strip(";").split(";")}["color"] - == "000080" - ) - ] - exon_file.close() - exon_list.extend((exonFields)) - except: - logging.warning("#TIME " + "%.3f\t" % (time() - TSTART) + 'unable to load exon file: "' + exon_filename + '"') - - -conserved_regions = interval_list(conserved_regions_filename, "bed") -conserved_regions.sort() - -wgexclude = interval_list(wgexclude_filename, "bed") -wgexclude.sort() - -centromere_list = interval_list(centromere_filename, "bed") -centromere_list.sort() -centromere_list = interval_list([i[0] for i in centromere_list.merge_clusters(extend=1)]) - - -segdup_list = interval_list(segdup_filename, "bed") -segdup_list.sort() diff --git a/bin/repeats.py b/bin/repeats.py old mode 100644 new mode 100755 diff --git a/bin/sample_metadata_skeleton.json b/bin/sample_metadata_skeleton.json deleted file mode 100644 index 9c3e7863..00000000 --- a/bin/sample_metadata_skeleton.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "sample_type": "", - "sample_source": "", - "tissue_of_origin": "", - "sample_description": "" -} \ No newline at end of file diff --git a/bin/simulations.py b/bin/simulations.py old mode 100644 new mode 100755 diff --git a/bin/utils.py b/bin/utils.py old mode 100644 new mode 100755 diff --git a/conf/modules.config b/conf/modules.config index d60c33bd..4f83dee2 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -56,6 +56,15 @@ process { ] } + withName: 'MULTIQC' { + ext.args = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' } + publishDir = [ + path: { "${params.outdir}/multiqc" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } // @@ -307,131 +316,73 @@ process { } // -// AmpliconArchitect Options +// AmpliconSuite Options // process { - withName: 'CNVKIT_BATCH' { - ext.args = "--method wgs" - publishDir = [ - path: { "${params.outdir}/cnvkit" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: true - ] - } - - withName: 'CNVKIT_SEGMENT' { - ext.args = "" - publishDir = [ - path: { "${params.outdir}/cnvkit" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, - enabled: true - ] - } - withName: 'PREPAREAA' { - ext.args = "" - publishDir = [ - path: { "${params.outdir}/prepareaa" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: true - ] - } - withName: 'COLLECT_SEEDS' { - ext.args = "" - publishDir = [ - path: { "${params.outdir}/ampliconarchitect/cnvkit" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: true - ] - } - withName: 'AMPLIFIED_INTERVALS' { - ext.args = "" - publishDir = [ - path: { "${params.outdir}/ampliconarchitect/cnvkit" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: true - ] - } - withName: 'AMPLICONARCHITECT_AMPLICONARCHITECT' { + withName: 'AMPLICONSUITE' { time = '96.h' ext.args = "" publishDir = [ [ - path: { "${params.outdir}/ampliconarchitect/ampliconarchitect/sv_view" }, + path: { "${params.outdir}/ampliconsuite/cnvkit" }, mode: params.publish_dir_mode, - pattern: '*.{png,pdf}', + pattern: '*{CNV_SEEDS.bed,pre_filtered.bed,call.cns,cnr.gz,md.cns,CALLS.bed}', saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ], [ - path: { "${params.outdir}/ampliconarchitect/ampliconarchitect/amplicons" }, + path: { "${params.outdir}/ampliconsuite/ampliconarchitect/logs" }, mode: params.publish_dir_mode, - pattern: '*{graph.txt,cycles.txt}', + pattern: '*{logs.txt}', saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ], [ - path: { "${params.outdir}/ampliconarchitect/ampliconarchitect/logs" }, + path: { "${params.outdir}/ampliconsuite/logs" }, mode: params.publish_dir_mode, - pattern: '*logs.txt', + pattern: '*{log.txt,sample_metadata.json,run_metadata.json,finish_flag.txt}', saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ], [ - path: { "${params.outdir}/ampliconarchitect/ampliconarchitect/cnseg" }, + path: { "${params.outdir}/ampliconsuite/ampliconarchitect/" }, mode: params.publish_dir_mode, - pattern: '*cnseg.txt', + pattern: '*{summary.txt,graph.txt,cycles.txt}', saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ], [ - path: { "${params.outdir}/ampliconarchitect/ampliconarchitect/summary" }, + path: { "${params.outdir}/ampliconsuite/ampliconarchitect/intermediate" }, mode: params.publish_dir_mode, - pattern: '*summary.txt', + pattern: '*{cnseg.txt,edges_cnseg.txt,.out}', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ], + [ + path: { "${params.outdir}/ampliconsuite/ampliconarchitect/sv_view" }, + mode: params.publish_dir_mode, + pattern: '*{.png,.pdf}', saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ], - - ] - } - withName: 'AMPLICONCLASSIFIER_AMPLICONCLASSIFIER' { - ext.args = "--report_complexity --verbose_classification --plotstyle 'individual'" - publishDir = [ - path: { "${params.outdir}/ampliconclassifier/ampliconclassifier" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: true - ] - } - withName: 'AMPLICONCLASSIFIER_AMPLICONSIMILARITY' { - ext.args = "" - publishDir = [ [ - path: { "${params.outdir}/ampliconclassifier/ampliconclassifier/input" }, + path: { "${params.outdir}/ampliconsuite/ampliconclassifier/input" }, mode: params.publish_dir_mode, pattern: '*.input', saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ], [ - path: { "${params.outdir}/ampliconclassifier/ampliconsimilarity/log" }, + path: { "${params.outdir}/ampliconsuite/ampliconclassifier/result" }, mode: params.publish_dir_mode, - pattern: '*.log', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: '*{result_data.json,result_table.tsv}', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ], [ - path: { "${params.outdir}/ampliconclassifier/ampliconsimilarity/similarity" }, + path: { "${params.outdir}/ampliconsuite/ampliconclassifier/" }, mode: params.publish_dir_mode, - pattern: '*_similarity_scores.tsv', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: '*{ecDNA_counts.tsv,context_calls.tsv,basic_properties.tsv,gene_list.tsv,feature_entropy.tsv,classification_profiles.tsv}', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ], + [ + path: { "${params.outdir}/ampliconsuite/ampliconclassifier/amplicon_information" }, + mode: params.publish_dir_mode, + pattern: '*{SV_summary.tsv,annotated_cycles.txt,intervals.bed}', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ], - ] - } - withName: 'AMPLICONCLASSIFIER_MAKEINPUT' { - ext.args = "" - publishDir = [ - path: { "${params.outdir}/ampliconclassifier/makeinput" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: true ] } withName: 'AMPLICONCLASSIFIER_MAKERESULTSTABLE' { @@ -564,13 +515,14 @@ process { if (!params.skip_multiqc) { process { withName: 'MULTIQC' { - ext.args = params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' + ext.args = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' } publishDir = [ path: { "${params.outdir}/multiqc" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] } + withName: 'CUSTOM_DUMPSOFTWAREVERSIONS' { publishDir = [ path: { "${params.outdir}/pipeline_info" }, diff --git a/conf/test.config b/conf/test.config index 40dfeffa..0f9e9d13 100644 --- a/conf/test.config +++ b/conf/test.config @@ -23,7 +23,6 @@ params { input = 'https://raw.githubusercontent.com/nf-core/test-datasets/circdna/samplesheet/samplesheet.csv' input_format = 'FASTQ' - // Genome references fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/circdna/reference/genome.fa' circle_identifier = 'circexplorer2,circle_finder,circle_map_realign,circle_map_repeats,unicycler' @@ -32,7 +31,5 @@ params { // Needed input skip_markduplicates = false - mosek_license_dir = "https://raw.githubusercontent.com/nf-core/test-datasets/circdna/mosek/mosek.lic" - aa_data_repo = "data_repo" reference_build = "GRCh38" } diff --git a/conf/test_AA.config b/conf/test_AA.config index 0acbb061..32d13a28 100644 --- a/conf/test_AA.config +++ b/conf/test_AA.config @@ -23,9 +23,7 @@ params { // Input data input = 'https://raw.githubusercontent.com/nf-core/test-datasets/circdna/samplesheet/samplesheet.csv' - - // Outdir - outdir = "./results" + input_format = 'FASTQ' // Genome references fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/circdna/reference/genome.fa' @@ -34,8 +32,8 @@ params { igenomes_ignore = true cnvkit_cnn = "https://raw.githubusercontent.com/nf-core/test-datasets/circdna/cnvkit/dummy_file.cnn" - mosek_license_dir = "https://raw.githubusercontent.com/nf-core/test-datasets/circdna/mosek/mosek.lic" - aa_data_repo = "data_repo" + mosek_license_dir = "worfklows" + aa_data_repo = "workflows" reference_build = "GRCh38" skip_qc = true } diff --git a/conf/test_AA_local.config b/conf/test_AA_local.config new file mode 100644 index 00000000..e3bd5384 --- /dev/null +++ b/conf/test_AA_local.config @@ -0,0 +1,36 @@ + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/circdna -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test AmpliconArchitect profile with small bam file.' 
+ config_profile_description = 'Test AmpliconArchitect usability with a small low-coverage BAM file (~760MB) of COLO320DM' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 4 + max_memory = '12.GB' + max_time = '12.h' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/circdna/samplesheet/test_AA_local.csv' + input_format = 'BAM' + + // Genome references + circle_identifier = 'ampliconarchitect' + skip_markduplicates = true + igenomes_ignore = true + + fasta = "/mnt/data_disk/R01/dschreye/genome.fa" + genome = "GRCh38" + reference_build = "GRCh38" + skip_qc = true +} diff --git a/conf/test_full.config b/conf/test_full.config index 18294b84..9930e2a6 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -10,8 +10,6 @@ ---------------------------------------------------------------------------------------- */ -cleanup = true - params { config_profile_name = 'Full test profile' config_profile_description = 'Full test dataset to check pipeline function' @@ -23,6 +21,7 @@ params { // Input data input = 'https://raw.githubusercontent.com/nf-core/test-datasets/circdna/samplesheet/samplesheet.csv' + input_format = 'FASTQ' // Outdir outdir = "./results" diff --git a/docs/images/nf-core-circdna_logo_dark.png b/docs/images/nf-core-circdna_logo_dark.png index 232f32be..727451ee 100644 Binary files a/docs/images/nf-core-circdna_logo_dark.png and b/docs/images/nf-core-circdna_logo_dark.png differ diff --git a/docs/images/nf-core-circdna_logo_light.png b/docs/images/nf-core-circdna_logo_light.png index 1c5283cd..6d730e09 100644 Binary files a/docs/images/nf-core-circdna_logo_light.png and b/docs/images/nf-core-circdna_logo_light.png differ diff --git a/docs/output.md b/docs/output.md index 01cb2072..4ed9d0d9 100644 --- a/docs/output.md +++ b/docs/output.md @@ -38,7 +38,9 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d [FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). -> **NB:** The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. +:::note +The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. +::: ### TrimGalore @@ -274,7 +276,27 @@ This Branch utilises the ability of [Unicycler](https://github.com/rrwick/Unicyc ### Branch: `ampliconarchitect` -This pipeline branch `ampliconarchitect` is only usable with WGS data. This branch uses the utility of [PrepareAA](https://github.com/jluebeck/Prepare) to collect amplified seeds from copy number calls, which will be then fed to [AmpliconArchitect](https://github.com/jluebeck/AmpliconArchitect) to characterise amplicons in each given sample. +This pipeline branch `ampliconarchitect` is only usable with WGS data. 
This branch uses the utility of the [AmpliconSuite-Pipeline](https://github.com/AmpliconSuite/AmpliconSuite-pipeline) to call copy numbers using [CNVkit](https://cnvkit.readthedocs.io/en/stable/), collect amplified seeds from copy number calls, call amplicons using [AmpliconArchitect](https://github.com/jluebeck/AmpliconArchitect), and classify these amplicons using [AmpliconClassifier](https://github.com/jluebeck/AmpliconClassifier). + +#### **AmpliconSuite-Pipeline** + +[AmpliconSuite-Pipeline](https://github.com/AmpliconSuite/AmpliconSuite-pipeline) performs all necessary steps to call copy numbers and amplicons from WGS data. + +
+Output files + +**Output directory: `results/ampliconsuite/logs`** + +- `[SAMPLE]_run_metadata.json` + - `json` file describing the run metadata with all necessary information on software versions, parameters, and commands +- `[SAMPLE]_perc_timing_log.txt` + - `txt` file describing the computing times of each process in the pipeline +- `[SAMPLE]_sample_metadata.json` + - `json` file describing the sample metadata +- `[SAMPLE]_finishing_flag.txt` + - `txt` file indicating whether the pipeline ran correctly + +
#### **CNVkit** @@ -283,12 +305,14 @@ This pipeline branch `ampliconarchitect` is only usable with WGS data. This bran
 Output files -**Output directory: `results/cnvkit`** +**Output directory: `results/ampliconsuite/cnvkit`** - `[SAMPLE]_CNV_GAIN.bed` - `bed` file containing filtered Copy Number calls - `[SAMPLE]_AA_CNV_SEEDS.bed` - `bed` file containing filtered and connected amplified regions (seeds). This is used as input for [AmpliconArchitect](https://github.com/jluebeck/AmpliconArchitect) +- `[SAMPLE].md_CNV_CALLS_.bed` + - `bed` file containing copy number calls in bed format. - `[SAMPLE].cnvkit.segment.cns` - `cns` file containing copy number calls of CNVkit segment. - `[SAMPLE].cnvkit.segment.cnr` @@ -298,20 +322,20 @@ This pipeline branch `ampliconarchitect` is only usable with WGS bran #### **AmpliconArchitect** -[AmpliconArchitect](https://github.com/jluebeck/AmpliconArchitect) uses amplicon seeds provided by `CNVkit`and `PrepareAA`to identify different types of amplicons in each sample. +[AmpliconArchitect](https://github.com/jluebeck/AmpliconArchitect) uses amplicon seeds provided by `CNVkit` and `PrepareAA` inside the [AmpliconSuite-Pipeline](https://github.com/AmpliconSuite/AmpliconSuite-pipeline) to identify different types of amplicons in each sample.
Output files -**Output directory: `results/ampliconarchitect/ampliconarchitect`** +**Output directory: `results/ampliconsuite/ampliconarchitect`** - `amplicons/[SAMPLE]_[AMPLICONID]_cycles.txt` - `txt`file describing the amplicon segments - `amplicons/[SAMPLE]_[AMPLICONID]_graph.txt` - `txt` file describing the amplicon graph -- `cnseg/[SAMPLE]_[SEGMENT]_graph.txt` +- `intermediate/[SAMPLE]_[SEGMENT]_graph.txt` - `txt` file describing the copy number segmentation file -- `summary/[SAMPLE]_summary.txt` +- `[SAMPLE]_summary.txt` - `txt` file describing each amplicon with regards to breakpoints, composition, oncogene content, copy number - `sv_view/[SAMPLE]_[AMPLICONID].{png,pdf}` - `png` or `pdf` file displaying the amplicon rearrangement signature @@ -325,24 +349,28 @@ This pipeline branch `ampliconarchitect` is only usable with WGS data. This bran
Output files -**Output directory: `results/ampliconclassifier`** - -- `makeinput/ampliconclassifier.input` - - `txt` file containing the input used for `AmpliconClassifier` and `AmpliconSimilarity`. -- `ampliconclassifier/ampliconclassifier_amplicon_classification_profiles.tsv` - - `tsv` file describing the amplicon class of each amplicon for each sample. -- `ecDNA_counts/ampliconclassifier_ecDNA_counts.tsv` - - `tsv` file describing if an amplicon is circular [1 = circular, 0 = non-circular]. -- `gene_list/ampliconclassifier_gene_list.tsv` - - `tsv` file detailing the genes on each amplicon. -- `log/ampliconclassifier_stdout.log` - - `log` file -- `ampliconsimilarity/ampliconclassifier_similarity_scores.tsv` - - `tsv` file containing amplicon similarity scores calculated by `AmpliconSimilarity`. -- `bed/[SAMPLE]_amplicon[AMPLICONID]_[CLASSIFICATION]_[ID]_intervals.bed` - - `bed` files containing information about the intervals on each amplicon. `unknown` intervals were not identified to be located on the respective amplicon. -- `resultstable/ampliconclassifier_result_table.[tsv,json]` +**Output directory: `results/ampliconsuite/ampliconclassifier`** + +- `input/[SAMPLE].input` + - `txt` file containing the input used for `AmpliconClassifier`. +- `amplicon_information/[SAMPLE]_[AMPLICON]_intervals.bed` + - `bed` file containing the regions of the respective amplicon. +- `amplicon_information/[SAMPLE]_[AMPLICON]_SV_summary.tsv` + - `tsv` file detailing the SVs identified in an amplicon. +- `amplicon_information/[SAMPLE]_[AMPLICON]_annotated_cycles.txt` + - `txt` file containing the annotated cycles information of AmpliconArchitect. +- `result/[SAMPLE]_result_table.[tsv,json]`. - `tsv` or `json` file of the results table tenerated by `AmpliconClassifier` which combines the output of `AmpliconArchitect` and `AmpliconClassifier`. +- `[SAMPLE]_amplicon_classification_profiles.tsv` + - `tsv` file describing the amplicon classes. +- `[SAMPLE]_gene_list.tsv` + - `tsv` file detailing the genes within each amplicon +- `[SAMPLE]_context_calls.tsv` + - `tsv` file with context for ecDNA calls. +- `[SAMPLE]_ecDNA_counts.tsv` + - `tsv` file with the number of ecDNAs in this sample. +- `[SAMPLE]_feature_basic_properties.tsv` + - `tsv` file with the amplicon information of captured region size, median feature CN, max feature CN, and borderline flag.
@@ -379,6 +407,7 @@ The plots will show: - Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`. - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameter's are used when running the pipeline. - Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`. + - Parameters used by the pipeline run: `params.json`.
diff --git a/docs/usage.md b/docs/usage.md index c63c06b5..99e93c65 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -30,7 +30,7 @@ The two input formats accepted by the pipeline are "FASTQ" and "BAM". If not spe ### FASTQ -```bash +```csv title="samplesheet.csv" sample,fastq_1,fastq_2 circdna_1,circdna_1_R1.fastq.gz,circdna_1_R2.fastq.gz circdna_2,circdna_2_R1.fastq.gz,circdna_2_R2.fastq.gz @@ -47,7 +47,7 @@ An [example samplesheet fastq](../assets/samplesheet.csv) has been provided with ### BAM -```bash +```csv title="samplesheet.csv" sample,bam circdna_1,circdna_1.bam circdna_2,circdna_2.bam @@ -65,7 +65,7 @@ An [example samplesheet bam](../assets/samplesheet_bam.csv) has been provided wi If using FASTQ input, the `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes: -```bash +```csv title="samplesheet.csv" sample,fastq_1,fastq_2 CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz @@ -88,7 +88,7 @@ The pipeline can be run from directly from bam files. Here,the samplesheet has t --input '[path to samplesheet file]' ``` -```console +```csv title="samplesheet.csv" sample,bam sample1, sample1.bam sample2, sample2.bam @@ -107,7 +107,7 @@ An [example samplesheet](../assets/samplesheet_bam.csv) has been provided with t The typical command for running the pipeline is as follows: ```bash -nextflow run nf-core/circdna --input samplesheet.csv --outdir --genome GRCh38 -profile docker --circle_identifier +nextflow run nf-core/circdna --input ./samplesheet.csv --outdir ./results --genome GRCh37 -profile docker ``` This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. @@ -120,12 +120,17 @@ work # Directory containing the nextflow working files .nextflow_log # Log file from Nextflow # Other nextflow hidden files, eg. history of pipeline runs and old logs. -If you wish to repeatedly use the same parameters for multiple runs, rather than specifying each flag in the command, you can specify these in a params file. +:::tip +If you wish to share such profile (such as upload as supplementary material for academic publications), make sure to NOT include cluster specific paths to files, nor institutional specific profiles. +::: Pipeline settings can be provided in a `yaml` or `json` file via `-params-file `. -> ⚠️ Do not use `-c ` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args). -> The above pipeline run specified with a params file in yaml format: +:::warning +Do not use `-c ` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args). 
+::: + +The above pipeline run specified with a params file in yaml format: ```bash nextflow run nf-core/circdna -profile docker -params-file params.yaml @@ -137,7 +142,6 @@ with `params.yaml` containing: input: './samplesheet.csv' outdir: './results/' genome: 'GRCh37' -input: 'data' <...> ``` @@ -161,11 +165,15 @@ This version number will be logged in reports when you run the pipeline, so that To further assist in reproducbility, you can use share and re-use [parameter files](#running-the-pipeline) to repeat pipeline runs with the same settings without having to write out a command with every single parameter. -> 💡 If you wish to share such profile (such as upload as supplementary material for academic publications), make sure to NOT include cluster specific paths to files, nor institutional specific profiles. +:::tip +If you wish to share such profile (such as upload as supplementary material for academic publications), make sure to NOT include cluster specific paths to files, nor institutional specific profiles. +::: ## Core Nextflow arguments -> **NB:** These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen). +:::note +These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen). +::: ### `-profile` @@ -173,7 +181,9 @@ Use this parameter to choose a configuration profile. Profiles can give configur Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Apptainer, Conda) - see below. -> We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported. +:::info +We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported. +::: The pipeline also dynamically loads configurations from [https://github.com/nf-core/configs](https://github.com/nf-core/configs) when it runs, making multiple config profiles for various institutional clusters available at run time. For more information and to see if your system is available in these configs please see the [nf-core/configs documentation](https://github.com/nf-core/configs#documentation). diff --git a/lib/NfcoreSchema.groovy b/lib/NfcoreSchema.groovy deleted file mode 100755 index 9b34804d..00000000 --- a/lib/NfcoreSchema.groovy +++ /dev/null @@ -1,530 +0,0 @@ -// -// This file holds several functions used to perform JSON parameter validation, help and summary rendering for the nf-core pipeline template. 
-// - -import nextflow.Nextflow -import org.everit.json.schema.Schema -import org.everit.json.schema.loader.SchemaLoader -import org.everit.json.schema.ValidationException -import org.json.JSONObject -import org.json.JSONTokener -import org.json.JSONArray -import groovy.json.JsonSlurper -import groovy.json.JsonBuilder - -class NfcoreSchema { - - // - // Resolve Schema path relative to main workflow directory - // - public static String getSchemaPath(workflow, schema_filename='nextflow_schema.json') { - return "${workflow.projectDir}/${schema_filename}" - } - - // - // Function to loop over all parameters defined in schema and check - // whether the given parameters adhere to the specifications - // - /* groovylint-disable-next-line UnusedPrivateMethodParameter */ - public static void validateParameters(workflow, params, log, schema_filename='nextflow_schema.json') { - def has_error = false - //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - // Check for nextflow core params and unexpected params - def json = new File(getSchemaPath(workflow, schema_filename=schema_filename)).text - def Map schemaParams = (Map) new JsonSlurper().parseText(json).get('definitions') - def nf_params = [ - // Options for base `nextflow` command - 'bg', - 'c', - 'C', - 'config', - 'd', - 'D', - 'dockerize', - 'h', - 'log', - 'q', - 'quiet', - 'syslog', - 'v', - - // Options for `nextflow run` command - 'ansi', - 'ansi-log', - 'bg', - 'bucket-dir', - 'c', - 'cache', - 'config', - 'dsl2', - 'dump-channels', - 'dump-hashes', - 'E', - 'entry', - 'latest', - 'lib', - 'main-script', - 'N', - 'name', - 'offline', - 'params-file', - 'pi', - 'plugins', - 'poll-interval', - 'pool-size', - 'profile', - 'ps', - 'qs', - 'queue-size', - 'r', - 'resume', - 'revision', - 'stdin', - 'stub', - 'stub-run', - 'test', - 'w', - 'with-apptainer', - 'with-charliecloud', - 'with-conda', - 'with-dag', - 'with-docker', - 'with-mpi', - 'with-notification', - 'with-podman', - 'with-report', - 'with-singularity', - 'with-timeline', - 'with-tower', - 'with-trace', - 'with-weblog', - 'without-docker', - 'without-podman', - 'work-dir' - ] - def unexpectedParams = [] - - // Collect expected parameters from the schema - def expectedParams = [] - def enums = [:] - for (group in schemaParams) { - for (p in group.value['properties']) { - expectedParams.push(p.key) - if (group.value['properties'][p.key].containsKey('enum')) { - enums[p.key] = group.value['properties'][p.key]['enum'] - } - } - } - - for (specifiedParam in params.keySet()) { - // nextflow params - if (nf_params.contains(specifiedParam)) { - log.error "ERROR: You used a core Nextflow option with two hyphens: '--${specifiedParam}'. 
Please resubmit with '-${specifiedParam}'" - has_error = true - } - // unexpected params - def params_ignore = params.schema_ignore_params.split(',') + 'schema_ignore_params' - def expectedParamsLowerCase = expectedParams.collect{ it.replace("-", "").toLowerCase() } - def specifiedParamLowerCase = specifiedParam.replace("-", "").toLowerCase() - def isCamelCaseBug = (specifiedParam.contains("-") && !expectedParams.contains(specifiedParam) && expectedParamsLowerCase.contains(specifiedParamLowerCase)) - if (!expectedParams.contains(specifiedParam) && !params_ignore.contains(specifiedParam) && !isCamelCaseBug) { - // Temporarily remove camelCase/camel-case params #1035 - def unexpectedParamsLowerCase = unexpectedParams.collect{ it.replace("-", "").toLowerCase()} - if (!unexpectedParamsLowerCase.contains(specifiedParamLowerCase)){ - unexpectedParams.push(specifiedParam) - } - } - } - - //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - // Validate parameters against the schema - InputStream input_stream = new File(getSchemaPath(workflow, schema_filename=schema_filename)).newInputStream() - JSONObject raw_schema = new JSONObject(new JSONTokener(input_stream)) - - // Remove anything that's in params.schema_ignore_params - raw_schema = removeIgnoredParams(raw_schema, params) - - Schema schema = SchemaLoader.load(raw_schema) - - // Clean the parameters - def cleanedParams = cleanParameters(params) - - // Convert to JSONObject - def jsonParams = new JsonBuilder(cleanedParams) - JSONObject params_json = new JSONObject(jsonParams.toString()) - - // Validate - try { - schema.validate(params_json) - } catch (ValidationException e) { - println '' - log.error 'ERROR: Validation of pipeline parameters failed!' - JSONObject exceptionJSON = e.toJSON() - printExceptions(exceptionJSON, params_json, log, enums) - println '' - has_error = true - } - - // Check for unexpected parameters - if (unexpectedParams.size() > 0) { - Map colors = NfcoreTemplate.logColours(params.monochrome_logs) - println '' - def warn_msg = 'Found unexpected parameters:' - for (unexpectedParam in unexpectedParams) { - warn_msg = warn_msg + "\n* --${unexpectedParam}: ${params[unexpectedParam].toString()}" - } - log.warn warn_msg - log.info "- ${colors.dim}Ignore this warning: params.schema_ignore_params = \"${unexpectedParams.join(',')}\" ${colors.reset}" - println '' - } - - if (has_error) { - Nextflow.error('Exiting!') - } - } - - // - // Beautify parameters for --help - // - public static String paramsHelp(workflow, params, command, schema_filename='nextflow_schema.json') { - Map colors = NfcoreTemplate.logColours(params.monochrome_logs) - Integer num_hidden = 0 - String output = '' - output += 'Typical pipeline command:\n\n' - output += " ${colors.cyan}${command}${colors.reset}\n\n" - Map params_map = paramsLoad(getSchemaPath(workflow, schema_filename=schema_filename)) - Integer max_chars = paramsMaxChars(params_map) + 1 - Integer desc_indent = max_chars + 14 - Integer dec_linewidth = 160 - desc_indent - for (group in params_map.keySet()) { - Integer num_params = 0 - String group_output = colors.underlined + colors.bold + group + colors.reset + '\n' - def group_params = params_map.get(group) // This gets the parameters of that particular group - for (param in group_params.keySet()) { - if (group_params.get(param).hidden && !params.show_hidden_params) { - num_hidden += 1 - continue; - } - def type = '[' + group_params.get(param).type + ']' - def description = group_params.get(param).description - def 
defaultValue = group_params.get(param).default != null ? " [default: " + group_params.get(param).default.toString() + "]" : '' - def description_default = description + colors.dim + defaultValue + colors.reset - // Wrap long description texts - // Loosely based on https://dzone.com/articles/groovy-plain-text-word-wrap - if (description_default.length() > dec_linewidth){ - List olines = [] - String oline = "" // " " * indent - description_default.split(" ").each() { wrd -> - if ((oline.size() + wrd.size()) <= dec_linewidth) { - oline += wrd + " " - } else { - olines += oline - oline = wrd + " " - } - } - olines += oline - description_default = olines.join("\n" + " " * desc_indent) - } - group_output += " --" + param.padRight(max_chars) + colors.dim + type.padRight(10) + colors.reset + description_default + '\n' - num_params += 1 - } - group_output += '\n' - if (num_params > 0){ - output += group_output - } - } - if (num_hidden > 0){ - output += colors.dim + "!! Hiding $num_hidden params, use --show_hidden_params to show them !!\n" + colors.reset - } - output += NfcoreTemplate.dashedLine(params.monochrome_logs) - return output - } - - // - // Groovy Map summarising parameters/workflow options used by the pipeline - // - public static LinkedHashMap paramsSummaryMap(workflow, params, schema_filename='nextflow_schema.json') { - // Get a selection of core Nextflow workflow options - def Map workflow_summary = [:] - if (workflow.revision) { - workflow_summary['revision'] = workflow.revision - } - workflow_summary['runName'] = workflow.runName - if (workflow.containerEngine) { - workflow_summary['containerEngine'] = workflow.containerEngine - } - if (workflow.container) { - workflow_summary['container'] = workflow.container - } - workflow_summary['launchDir'] = workflow.launchDir - workflow_summary['workDir'] = workflow.workDir - workflow_summary['projectDir'] = workflow.projectDir - workflow_summary['userName'] = workflow.userName - workflow_summary['profile'] = workflow.profile - workflow_summary['configFiles'] = workflow.configFiles.join(', ') - - // Get pipeline parameters defined in JSON Schema - def Map params_summary = [:] - def params_map = paramsLoad(getSchemaPath(workflow, schema_filename=schema_filename)) - for (group in params_map.keySet()) { - def sub_params = new LinkedHashMap() - def group_params = params_map.get(group) // This gets the parameters of that particular group - for (param in group_params.keySet()) { - if (params.containsKey(param)) { - def params_value = params.get(param) - def schema_value = group_params.get(param).default - def param_type = group_params.get(param).type - if (schema_value != null) { - if (param_type == 'string') { - if (schema_value.contains('$projectDir') || schema_value.contains('${projectDir}')) { - def sub_string = schema_value.replace('\$projectDir', '') - sub_string = sub_string.replace('\${projectDir}', '') - if (params_value.contains(sub_string)) { - schema_value = params_value - } - } - if (schema_value.contains('$params.outdir') || schema_value.contains('${params.outdir}')) { - def sub_string = schema_value.replace('\$params.outdir', '') - sub_string = sub_string.replace('\${params.outdir}', '') - if ("${params.outdir}${sub_string}" == params_value) { - schema_value = params_value - } - } - } - } - - // We have a default in the schema, and this isn't it - if (schema_value != null && params_value != schema_value) { - sub_params.put(param, params_value) - } - // No default in the schema, and this isn't empty - else if (schema_value == null && 
params_value != "" && params_value != null && params_value != false) { - sub_params.put(param, params_value) - } - } - } - params_summary.put(group, sub_params) - } - return [ 'Core Nextflow options' : workflow_summary ] << params_summary - } - - // - // Beautify parameters for summary and return as string - // - public static String paramsSummaryLog(workflow, params) { - Map colors = NfcoreTemplate.logColours(params.monochrome_logs) - String output = '' - def params_map = paramsSummaryMap(workflow, params) - def max_chars = paramsMaxChars(params_map) - for (group in params_map.keySet()) { - def group_params = params_map.get(group) // This gets the parameters of that particular group - if (group_params) { - output += colors.bold + group + colors.reset + '\n' - for (param in group_params.keySet()) { - output += " " + colors.blue + param.padRight(max_chars) + ": " + colors.green + group_params.get(param) + colors.reset + '\n' - } - output += '\n' - } - } - output += "!! Only displaying parameters that differ from the pipeline defaults !!\n" - output += NfcoreTemplate.dashedLine(params.monochrome_logs) - return output - } - - // - // Loop over nested exceptions and print the causingException - // - private static void printExceptions(ex_json, params_json, log, enums, limit=5) { - def causingExceptions = ex_json['causingExceptions'] - if (causingExceptions.length() == 0) { - def m = ex_json['message'] =~ /required key \[([^\]]+)\] not found/ - // Missing required param - if (m.matches()) { - log.error "* Missing required parameter: --${m[0][1]}" - } - // Other base-level error - else if (ex_json['pointerToViolation'] == '#') { - log.error "* ${ex_json['message']}" - } - // Error with specific param - else { - def param = ex_json['pointerToViolation'] - ~/^#\// - def param_val = params_json[param].toString() - if (enums.containsKey(param)) { - def error_msg = "* --${param}: '${param_val}' is not a valid choice (Available choices" - if (enums[param].size() > limit) { - log.error "${error_msg} (${limit} of ${enums[param].size()}): ${enums[param][0..limit-1].join(', ')}, ... 
)" - } else { - log.error "${error_msg}: ${enums[param].join(', ')})" - } - } else { - log.error "* --${param}: ${ex_json['message']} (${param_val})" - } - } - } - for (ex in causingExceptions) { - printExceptions(ex, params_json, log, enums) - } - } - - // - // Remove an element from a JSONArray - // - private static JSONArray removeElement(json_array, element) { - def list = [] - int len = json_array.length() - for (int i=0;i - if(raw_schema.keySet().contains('definitions')){ - raw_schema.definitions.each { definition -> - for (key in definition.keySet()){ - if (definition[key].get("properties").keySet().contains(ignore_param)){ - // Remove the param to ignore - definition[key].get("properties").remove(ignore_param) - // If the param was required, change this - if (definition[key].has("required")) { - def cleaned_required = removeElement(definition[key].required, ignore_param) - definition[key].put("required", cleaned_required) - } - } - } - } - } - if(raw_schema.keySet().contains('properties') && raw_schema.get('properties').keySet().contains(ignore_param)) { - raw_schema.get("properties").remove(ignore_param) - } - if(raw_schema.keySet().contains('required') && raw_schema.required.contains(ignore_param)) { - def cleaned_required = removeElement(raw_schema.required, ignore_param) - raw_schema.put("required", cleaned_required) - } - } - return raw_schema - } - - // - // Clean and check parameters relative to Nextflow native classes - // - private static Map cleanParameters(params) { - def new_params = params.getClass().newInstance(params) - for (p in params) { - // remove anything evaluating to false - if (!p['value']) { - new_params.remove(p.key) - } - // Cast MemoryUnit to String - if (p['value'].getClass() == nextflow.util.MemoryUnit) { - new_params.replace(p.key, p['value'].toString()) - } - // Cast Duration to String - if (p['value'].getClass() == nextflow.util.Duration) { - new_params.replace(p.key, p['value'].toString().replaceFirst(/d(?!\S)/, "day")) - } - // Cast LinkedHashMap to String - if (p['value'].getClass() == LinkedHashMap) { - new_params.replace(p.key, p['value'].toString()) - } - } - return new_params - } - - // - // This function tries to read a JSON params file - // - private static LinkedHashMap paramsLoad(String json_schema) { - def params_map = new LinkedHashMap() - try { - params_map = paramsRead(json_schema) - } catch (Exception e) { - println "Could not read parameters settings from JSON. $e" - params_map = new LinkedHashMap() - } - return params_map - } - - // - // Method to actually read in JSON file using Groovy. - // Group (as Key), values are all parameters - // - Parameter1 as Key, Description as Value - // - Parameter2 as Key, Description as Value - // .... 
- // Group - // - - private static LinkedHashMap paramsRead(String json_schema) throws Exception { - def json = new File(json_schema).text - def Map schema_definitions = (Map) new JsonSlurper().parseText(json).get('definitions') - def Map schema_properties = (Map) new JsonSlurper().parseText(json).get('properties') - /* Tree looks like this in nf-core schema - * definitions <- this is what the first get('definitions') gets us - group 1 - title - description - properties - parameter 1 - type - description - parameter 2 - type - description - group 2 - title - description - properties - parameter 1 - type - description - * properties <- parameters can also be ungrouped, outside of definitions - parameter 1 - type - description - */ - - // Grouped params - def params_map = new LinkedHashMap() - schema_definitions.each { key, val -> - def Map group = schema_definitions."$key".properties // Gets the property object of the group - def title = schema_definitions."$key".title - def sub_params = new LinkedHashMap() - group.each { innerkey, value -> - sub_params.put(innerkey, value) - } - params_map.put(title, sub_params) - } - - // Ungrouped params - def ungrouped_params = new LinkedHashMap() - schema_properties.each { innerkey, value -> - ungrouped_params.put(innerkey, value) - } - params_map.put("Other parameters", ungrouped_params) - - return params_map - } - - // - // Get maximum number of characters across all parameter names - // - private static Integer paramsMaxChars(params_map) { - Integer max_chars = 0 - for (group in params_map.keySet()) { - def group_params = params_map.get(group) // This gets the parameters of that particular group - for (param in group_params.keySet()) { - if (param.size() > max_chars) { - max_chars = param.size() - } - } - } - return max_chars - } -} diff --git a/lib/NfcoreTemplate.groovy b/lib/NfcoreTemplate.groovy index 25a0a74a..e248e4c3 100755 --- a/lib/NfcoreTemplate.groovy +++ b/lib/NfcoreTemplate.groovy @@ -3,6 +3,8 @@ // import org.yaml.snakeyaml.Yaml +import groovy.json.JsonOutput +import nextflow.extension.FilesEx class NfcoreTemplate { @@ -128,7 +130,7 @@ class NfcoreTemplate { def email_html = html_template.toString() // Render the sendmail template - def max_multiqc_email_size = params.max_multiqc_email_size as nextflow.util.MemoryUnit + def max_multiqc_email_size = (params.containsKey('max_multiqc_email_size') ? 
params.max_multiqc_email_size : 0) as nextflow.util.MemoryUnit def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, projectDir: "$projectDir", mqcFile: mqc_report, mqcMaxSize: max_multiqc_email_size.toBytes() ] def sf = new File("$projectDir/assets/sendmail_template.txt") def sendmail_template = engine.createTemplate(sf).make(smail_fields) @@ -140,12 +142,14 @@ class NfcoreTemplate { try { if (params.plaintext_email) { throw GroovyException('Send plaintext e-mail, not HTML') } // Try to send HTML e-mail using sendmail + def sendmail_tf = new File(workflow.launchDir.toString(), ".sendmail_tmp.html") + sendmail_tf.withWriter { w -> w << sendmail_html } [ 'sendmail', '-t' ].execute() << sendmail_html log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Sent summary e-mail to $email_address (sendmail)-" } catch (all) { // Catch failures and try with plaintext def mail_cmd = [ 'mail', '-s', subject, '--content-type=text/html', email_address ] - if ( mqc_report.size() <= max_multiqc_email_size.toBytes() ) { + if ( mqc_report != null && mqc_report.size() <= max_multiqc_email_size.toBytes() ) { mail_cmd += [ '-A', mqc_report ] } mail_cmd.execute() << email_html @@ -154,14 +158,16 @@ class NfcoreTemplate { } // Write summary e-mail HTML to a file - def output_d = new File("${params.outdir}/pipeline_info/") - if (!output_d.exists()) { - output_d.mkdirs() - } - def output_hf = new File(output_d, "pipeline_report.html") + def output_hf = new File(workflow.launchDir.toString(), ".pipeline_report.html") output_hf.withWriter { w -> w << email_html } - def output_tf = new File(output_d, "pipeline_report.txt") + FilesEx.copyTo(output_hf.toPath(), "${params.outdir}/pipeline_info/pipeline_report.html"); + output_hf.delete() + + // Write summary e-mail TXT to a file + def output_tf = new File(workflow.launchDir.toString(), ".pipeline_report.txt") output_tf.withWriter { w -> w << email_txt } + FilesEx.copyTo(output_tf.toPath(), "${params.outdir}/pipeline_info/pipeline_report.txt"); + output_tf.delete() } // @@ -222,6 +228,20 @@ class NfcoreTemplate { } } + // + // Dump pipeline parameters in a json file + // + public static void dump_parameters(workflow, params) { + def timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') + def filename = "params_${timestamp}.json" + def temp_pf = new File(workflow.launchDir.toString(), ".${filename}") + def jsonStr = JsonOutput.toJson(params) + temp_pf.text = JsonOutput.prettyPrint(jsonStr) + + FilesEx.copyTo(temp_pf.toPath(), "${params.outdir}/pipeline_info/params_${timestamp}.json") + temp_pf.delete() + } + // // Print pipeline summary on completion // diff --git a/lib/WorkflowCircdna.groovy b/lib/WorkflowCircdna.groovy index cc1128ef..06fd8bbf 100755 --- a/lib/WorkflowCircdna.groovy +++ b/lib/WorkflowCircdna.groovy @@ -11,6 +11,7 @@ class WorkflowCircdna { // Check and validate parameters // public static void initialise(params, log) { + genomeExistsError(params, log) @@ -44,17 +45,59 @@ class WorkflowCircdna { yaml_file_text += "data: |\n" yaml_file_text += "${summary_section}" return yaml_file_text - }// + } + + // + // Generate methods description for MultiQC + // + + public static String toolCitationText(params) { + + // TODO nf-core: Optionally add in-text citation tools to this list. + // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "Tool (Foo et al. 
2023)" : "", + // Uncomment function in methodsDescriptionText to render in MultiQC report + def citation_text = [ + "Tools used in the workflow included:", + "FastQC (Andrews 2010),", + "MultiQC (Ewels et al. 2016)", + "." + ].join(' ').trim() + + return citation_text + } + + public static String toolBibliographyText(params) { + + // TODO Optionally add bibliographic entries to this list. + // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "
<li>Author (2023) Pub name, Journal, DOI</li>" : "", + // Uncomment function in methodsDescriptionText to render in MultiQC report + def reference_text = [ + "
<li>Andrews S, (2010) FastQC, URL: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/).</li>", + "<li>Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics , 32(19), 3047–3048. doi: /10.1093/bioinformatics/btw354</li>" ].join(' ').trim() + + return reference_text + } - public static String methodsDescriptionText(run_workflow, mqc_methods_yaml) { + public static String methodsDescriptionText(run_workflow, mqc_methods_yaml, params) { // Convert to a named map so can be used as with familiar NXF ${workflow} variable syntax in the MultiQC YML file def meta = [:] meta.workflow = run_workflow.toMap() meta["manifest_map"] = run_workflow.manifest.toMap() + // Pipeline DOI meta["doi_text"] = meta.manifest_map.doi ? "(doi: ${meta.manifest_map.doi})" : "" meta["nodoi_text"] = meta.manifest_map.doi ? "": "
<li>If available, make sure to update the text to include the Zenodo DOI of the version of the pipeline used.
  • " + // Tool references + meta["tool_citations"] = "" + meta["tool_bibliography"] = "" + + // TODO Only uncomment below if logic in toolCitationText/toolBibliographyText has been filled! + //meta["tool_citations"] = toolCitationText(params).replaceAll(", \\.", ".").replaceAll("\\. \\.", ".").replaceAll(", \\.", ".") + //meta["tool_bibliography"] = toolBibliographyText(params) + + def methods_text = mqc_methods_yaml.text def engine = new SimpleTemplateEngine() diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index 7d26a0b0..fefbe0ec 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -19,40 +19,11 @@ class WorkflowMain { " https://github.com/${workflow.manifest.name}/blob/master/CITATIONS.md" } - // - // Generate help string - // - public static String help(workflow, params) { - def command = "nextflow run ${workflow.manifest.name} --input samplesheet.csv --genome GRCh37 -profile docker" - def help_string = '' - help_string += NfcoreTemplate.logo(workflow, params.monochrome_logs) - help_string += NfcoreSchema.paramsHelp(workflow, params, command) - help_string += '\n' + citation(workflow) + '\n' - help_string += NfcoreTemplate.dashedLine(params.monochrome_logs) - return help_string - } - - // - // Generate parameter summary log string - // - public static String paramsSummaryLog(workflow, params) { - def summary_log = '' - summary_log += NfcoreTemplate.logo(workflow, params.monochrome_logs) - summary_log += NfcoreSchema.paramsSummaryLog(workflow, params) - summary_log += '\n' + citation(workflow) + '\n' - summary_log += NfcoreTemplate.dashedLine(params.monochrome_logs) - return summary_log - } // // Validate parameters and print summary to screen // - public static void initialise(workflow, params, log) { - // Print help to screen if required - if (params.help) { - log.info help(workflow, params) - System.exit(0) - } + public static void initialise(workflow, params, log, args) { // Print workflow version and exit on --version if (params.version) { @@ -61,16 +32,10 @@ class WorkflowMain { System.exit(0) } - // Print parameter summary log to screen - log.info paramsSummaryLog(workflow, params) - - // Validate workflow parameters via the JSON schema - if (params.validate_params) { - NfcoreSchema.validateParameters(workflow, params, log) - } - // Check that a -profile or Nextflow config has been provided to run the pipeline NfcoreTemplate.checkConfigProvided(workflow, log) + // Check that the profile doesn't contain spaces and doesn't end with a trailing comma + checkProfile(workflow.profile, args, log) // Check that conda channels are set-up correctly if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { @@ -96,4 +61,16 @@ class WorkflowMain { } return null } + + // + // Exit pipeline if --profile contains spaces + // + private static void checkProfile(profile, args, log) { + if (profile.endsWith(',')) { + Nextflow.error "Profile cannot end with a trailing comma. Please remove the comma from the end of the profile string.\nHint: A common mistake is to provide multiple values to `-profile` separated by spaces. Please use commas to separate profiles instead,e.g., `-profile docker,test`." + } + if (args[0]) { + log.warn "nf-core pipelines do not accept positional arguments. The positional argument `${args[0]}` has been detected.\n Hint: A common mistake is to provide multiple values to `-profile` separated by spaces. Please use commas to separate profiles instead,e.g., `-profile docker,test`." 
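In practice, the `checkProfile()` guard added above means `-profile` values must be comma separated, with no spaces and no trailing comma. An illustrative sketch of commands that pass or trip the new checks (the `--outdir` value is just a placeholder):

```bash
# OK: profiles separated by commas
nextflow run nf-core/circdna -profile docker,test --outdir results

# Warns: "test" is parsed as a positional argument (space-separated profiles)
nextflow run nf-core/circdna -profile docker test --outdir results

# Errors: trailing comma in the profile string
nextflow run nf-core/circdna -profile docker,test, --outdir results
```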
+ } + } } diff --git a/lib/nfcore_external_java_deps.jar b/lib/nfcore_external_java_deps.jar deleted file mode 100644 index 805c8bb5..00000000 Binary files a/lib/nfcore_external_java_deps.jar and /dev/null differ diff --git a/main.nf b/main.nf index 62a59150..a2e35abe 100644 --- a/main.nf +++ b/main.nf @@ -17,6 +17,8 @@ nextflow.enable.dsl = 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ +// This is an example of how to use getGenomeAttribute() to fetch parameters +// from igenomes.config using `--genome` params.fasta = WorkflowMain.getGenomeAttribute(params, 'fasta') /* @@ -25,7 +27,23 @@ params.fasta = WorkflowMain.getGenomeAttribute(params, 'fasta') ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -WorkflowMain.initialise(workflow, params, log) +include { validateParameters; paramsHelp } from 'plugin/nf-validation' + +// Print help message if needed +if (params.help) { + def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) + def citation = '\n' + WorkflowMain.citation(workflow) + '\n' + def String command = "nextflow run ${workflow.manifest.name} --input samplesheet.csv --genome GRCh38 -profile docker -outdir results --circle_identifier [circexplorer2,circle_map_realign,circle_map_repeats,circle_finder,unicycler,ampliconarchitect]" + log.info logo + paramsHelp(command) + citation + NfcoreTemplate.dashedLine(params.monochrome_logs) + System.exit(0) +} + +// Validate input parameters +if (params.validate_params) { + validateParameters() +} + +WorkflowMain.initialise(workflow, params, log, args) /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/modules.json b/modules.json index 6d809e90..13f158f1 100644 --- a/modules.json +++ b/modules.json @@ -7,77 +7,72 @@ "nf-core": { "bwa/index": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "cat/fastq": { "branch": "master", - "git_sha": "5c460c5a4736974abde2843294f35307ee2b0e5e", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "custom/dumpsoftwareversions": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "8ec825f465b9c17f9d83000022995b4f7de6fe93", "installed_by": ["modules"] }, "fastqc": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "617777a807a1770f73deb38c80004bac06807eef", "installed_by": ["modules"] }, "minimap2/align": { "branch": "master", - "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", - "installed_by": ["modules"] - }, - "multiqc": { - "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", "installed_by": ["modules"] }, "picard/markduplicates": { "branch": "master", - "git_sha": "735e1e04e7e01751d2d6e97055bbdb6f70683cc1", + "git_sha": "20b0918591d4ba20047d7e13e5094bcceba81447", "installed_by": ["bam_markduplicates_picard", "modules"] }, "samtools/faidx": { "branch": "master", - "git_sha": "bf8ff98531167f8245ba5c44ce7d781503ddf936", + "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", "installed_by": ["modules"] }, "samtools/flagstat": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": ["modules", "bam_stats_samtools"] + "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", + "installed_by": 
["bam_stats_samtools", "modules"] }, "samtools/idxstats": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": ["modules", "bam_stats_samtools"] + "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", + "installed_by": ["bam_stats_samtools", "modules"] }, "samtools/index": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", "installed_by": ["bam_markduplicates_picard", "modules"] }, "samtools/sort": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", "installed_by": ["modules"] }, "samtools/stats": { "branch": "master", - "git_sha": "735e1e04e7e01751d2d6e97055bbdb6f70683cc1", - "installed_by": ["modules", "bam_stats_samtools"] + "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", + "installed_by": ["bam_stats_samtools", "modules"] }, "samtools/view": { "branch": "master", - "git_sha": "3ffae3598260a99e8db3207dead9f73f87f90d1f", + "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", "installed_by": ["modules"] }, "trimgalore": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] } } @@ -86,12 +81,12 @@ "nf-core": { "bam_markduplicates_picard": { "branch": "master", - "git_sha": "a9784afdd5dcda23b84e64db75dc591065d64653", + "git_sha": "0c38be7e652a0b2f3a37681ee4c0dbdf85677647", "installed_by": ["subworkflows"] }, "bam_stats_samtools": { "branch": "master", - "git_sha": "735e1e04e7e01751d2d6e97055bbdb6f70683cc1", + "git_sha": "0c38be7e652a0b2f3a37681ee4c0dbdf85677647", "installed_by": ["bam_markduplicates_picard", "subworkflows"] } } diff --git a/modules/local/ampliconarchitect/ampliconarchitect.nf b/modules/local/ampliconarchitect/ampliconarchitect.nf deleted file mode 100644 index 6a96c30b..00000000 --- a/modules/local/ampliconarchitect/ampliconarchitect.nf +++ /dev/null @@ -1,66 +0,0 @@ -process AMPLICONARCHITECT_AMPLICONARCHITECT { - tag "$meta.id" - label 'process_low' - - conda "conda-forge::python=2.7 bioconda::pysam=0.15.2 anaconda::flask=1.1.2 anaconda::cython=0.29.14 anaconda::numpy=1.16.6 anaconda::scipy=1.2.1 conda-forge::matplotlib=2.2.5 mosek::mosek=8.0.60 anaconda::future=0.18.2" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/mulled-v2-6eefa51f13933d65b4f8155ca2f8cd81dea474ba:baa777f7c4e89a2ec4d1eab7d424a1f46503bac7-0': - 'quay.io/biocontainers/mulled-v2-6eefa51f13933d65b4f8155ca2f8cd81dea474ba:baa777f7c4e89a2ec4d1eab7d424a1f46503bac7-0' }" - - input: - tuple val(meta), path(bam), path(bai), path(bed) - - output: - path "versions.yml" , emit: versions - tuple val(meta), path("*cycles.txt") , optional: true, emit: cycles - tuple val(meta), path("*graph.txt") , optional: true, emit: graph - tuple val(meta), path("*.out") , optional: true, emit: out - tuple val(meta), path("*_cnseg.txt") , optional: true, emit: cnseg - tuple val(meta), path("*.pdf") , optional: true, emit: pdf - tuple val(meta), path("*.png") , optional: true, emit: png - tuple val(meta), path("*_summary.txt") , optional: true, emit: summary - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - """ - export AA_DATA_REPO=${params.aa_data_repo} - export MOSEKLM_LICENSE_FILE=${params.mosek_license_dir} - export AA_SRC=${projectDir}/bin - REF=${params.reference_build} - - AmpliconArchitect.py $args \\ - --bam $bam --bed $bed --ref \$REF --out "${prefix}" - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - AmpliconArchitect: \$(echo \$(AmpliconArchitect.py --version 2>&1) | sed 's/AmpliconArchitect version //g') - END_VERSIONS - """ - - stub: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - """ - export AA_DATA_REPO=${params.aa_data_repo} - export MOSEKLM_LICENSE_FILE=${params.mosek_license_dir} - export AA_SRC=${projectDir}/bin - REF=${params.reference_build} - - touch "${prefix}.logs.txt" - touch "${prefix}.cycles.txt" - touch "${prefix}.graph.txt" - touch "${prefix}.out" - touch "${prefix}_cnseg.txt" - touch "${prefix}.pdf" - touch "${prefix}.png" - touch "${prefix}_summary.txt" - - AmpliconArchitect.py --help - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - AmpliconArchitect: \$(echo \$(AmpliconArchitect.py --version 2>&1) | sed 's/AmpliconArchitect version //g') - END_VERSIONS - """ -} diff --git a/modules/local/ampliconclassifier/ampliconclassifier.nf b/modules/local/ampliconclassifier/ampliconclassifier.nf deleted file mode 100644 index 299f7168..00000000 --- a/modules/local/ampliconclassifier/ampliconclassifier.nf +++ /dev/null @@ -1,67 +0,0 @@ -process AMPLICONCLASSIFIER_AMPLICONCLASSIFIER { - tag "AA Amplicons" - label 'process_low' - - conda "bioconda::ampliconclassifier=0.4.14" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/ampliconclassifier:0.4.14--hdfd78af_0': - 'quay.io/biocontainers/ampliconclassifier:0.4.14--hdfd78af_0' }" - - input: - path (input_file) - - output: - path ("*amplicon_classification_profiles.tsv" ), emit: class_tsv , optional: true - path ("*edge_classification_profiles.tsv" ), emit: edge_tsv , optional: true - path ("*gene_list.tsv" ) , emit: gene_list , optional: true - path ("*ecDNA_counts.tsv" ) , emit: ecDNA_counts , optional: true - path ("*.bed" ) , emit: bed , optional: true - path ("*annotated_cycles.txt" ) , emit: annotated_cycles, optional: true - path ("*class_radar.{png,pdf}" ) , emit: radar_plot , optional: true - path ("*feature_entropy.tsv" ) , emit: entropy , optional: true - path ("*feature_basic_properties.tsv" ) , emit: basic_properties, optional: true - path ("*classification_bed_files/*" ) , emit: bed_files , optional: true - path ("*annotated_cycles_files/" ) , emit: cycles_files , optional: true - path ("*.classifier_stdout.log" ) , emit: log , optional: true - path ("*" ) , emit: all , optional: true - path ("versions.yml" ) , emit: versions , optional: true - - script: - def args = task.ext.args ?: '' - - """ - REF=${params.reference_build} - export AA_DATA_REPO=${params.aa_data_repo} - export AA_SRC=${projectDir}/bin - - amplicon_classifier.py \\ - --ref \$REF \\ - $args \\ - --input $input_file \\ - > ampliconclassifier.classifier_stdout.log - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - AmpliconClassifier: \$(echo \$(amplicon_classifier.py --version | sed 's/amplicon_classifier //g' | sed 's/ .*//g')) - END_VERSIONS - """ - - stub: - def args = task.ext.args ?: '' - """ - export AA_DATA_REPO=${params.aa_data_repo} - export MOSEKLM_LICENSE_FILE=${params.mosek_license_dir} - export AA_SRC=${projectDir}/bin - REF=${params.reference_build} - - touch "ampliconclassifier_amplicon_classification_profiles.tsv" - touch "ampliconclassifier_classifier_stdout.log" - - amplicon_classifier.py --help - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - AmpliconClassifier: \$(echo \$(amplicon_classifier.py --version | sed 's/amplicon_classifier //g' | sed 's/ .*//g')) - END_VERSIONS - """ -} diff --git a/modules/local/ampliconclassifier/ampliconsimilarity.nf b/modules/local/ampliconclassifier/ampliconsimilarity.nf deleted file mode 100644 index 612e97ff..00000000 --- a/modules/local/ampliconclassifier/ampliconsimilarity.nf +++ /dev/null @@ -1,54 +0,0 @@ -process AMPLICONCLASSIFIER_AMPLICONSIMILARITY { - tag "AA Amplicons" - label 'process_low' - - conda "bioconda::ampliconclassifier=0.4.14" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/ampliconclassifier:0.4.14--hdfd78af_0': - 'quay.io/biocontainers/ampliconclassifier:0.4.14--hdfd78af_0' }" - - input: - path(input) - - output: - path("*_scores.tsv") , emit: scores - path("*") - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - """ - REF=${params.reference_build} - export AA_DATA_REPO=${params.aa_data_repo} - export AA_SRC=${projectDir}/bin - - amplicon_similarity.py \\ - --ref \$REF \\ - $args \\ - --input $input - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - AmpliconClassifier: \$(echo \$(amplicon_classifier.py --version | sed 's/amplicon_classifier //g' | sed 's/ .*//g')) - END_VERSIONS - """ - - stub: - def args = task.ext.args ?: '' - """ - REF=${params.reference_build} - export AA_DATA_REPO=${params.aa_data_repo} - export AA_SRC=${projectDir}/bin - - amplicon_similarity.py --help - touch "ampliconclassifier_similarity_scores.tsv" - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - AmpliconClassifier: \$(echo \$(amplicon_classifier.py --version | sed 's/amplicon_classifier //g' | sed 's/ .*//g')) - END_VERSIONS - """ -} diff --git a/modules/local/ampliconclassifier/makeinput.nf b/modules/local/ampliconclassifier/makeinput.nf deleted file mode 100644 index c872589f..00000000 --- a/modules/local/ampliconclassifier/makeinput.nf +++ /dev/null @@ -1,43 +0,0 @@ -process AMPLICONCLASSIFIER_MAKEINPUT { - tag 'AA Amplicons' - label 'process_low' - - conda "bioconda::ampliconclassifier=0.4.14" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/ampliconclassifier:0.4.14--hdfd78af_0': - 'quay.io/biocontainers/ampliconclassifier:0.4.14--hdfd78af_0' }" - - input: - path(graph) - path(cycles) - - output: - path "*.input" , emit: input - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - - """ - make_input.sh ./ ampliconclassifier - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - AmpliconClassifier: \$(echo \$(amplicon_classifier.py --version | sed 's/amplicon_classifier //g' | sed 's/ .*//g')) - END_VERSIONS - """ - - stub: - def args = task.ext.args ?: '' - """ - touch "ampliconclassifier.input" - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - AmpliconClassifier: \$(echo \$(amplicon_classifier.py --version | sed 's/amplicon_classifier //g' | sed 's/ .*//g')) - END_VERSIONS - """ -} diff --git a/modules/local/ampliconclassifier/makeresultstable.nf b/modules/local/ampliconclassifier/makeresultstable.nf deleted file mode 100644 index 03df04a1..00000000 --- a/modules/local/ampliconclassifier/makeresultstable.nf +++ /dev/null @@ -1,60 +0,0 @@ -process AMPLICONCLASSIFIER_MAKERESULTSTABLE { - tag 'AA Amplicons' - label 'process_low' - - conda "bioconda::ampliconclassifier=0.4.14" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/ampliconclassifier:0.4.14--hdfd78af_0': - 'quay.io/biocontainers/ampliconclassifier:0.4.14--hdfd78af_0' }" - - input: - path (input_file) - path (class_file) - path (gene_list) - path (feature_entropy) - path (basic_properties) - path (bed_files) - - output: - path "*result_data.json" , emit: json - path "*result_table.tsv" , emit: tsv - path "index.html" , emit: html - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - """ - # Create subdirectories in working directory - mkdir ampliconclassifier_classification_bed_files - mv $bed_files ampliconclassifier_classification_bed_files/ - - make_results_table.py \\ - $args \\ - --input $input_file \\ - --classification_file $class_file - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - AmpliconClassifier: \$(echo \$(amplicon_classifier.py --version | sed 's/amplicon_classifier //g' | sed 's/ .*//g')) - END_VERSIONS - """ - - stub: - def args = task.ext.args ?: '' - - """ - make_results_table.py --help - - touch ampliconclasifier_result_data.json - touch ampliconclasifier_result_table.tsv - touch index.html - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - AmpliconClassifier: \$(echo \$(amplicon_classifier.py --version | sed 's/amplicon_classifier //g' | sed 's/ .*//g')) - END_VERSIONS - """ -} diff --git a/modules/local/ampliconsuite/Dockerfile b/modules/local/ampliconsuite/Dockerfile index 90e3e657..699caba2 100644 --- a/modules/local/ampliconsuite/Dockerfile +++ b/modules/local/ampliconsuite/Dockerfile @@ -1,12 +1,41 @@ -FROM python:3.10 - -# Install Python packages -RUN pip install --no-cache-dir \ - pysam==0.21.0 \ - flask==2.3.2 \ - numpy==1.24.3 \ - matplotlib==3.7.1 \ - scipy==1.10.0 \ - intervaltree==3.1.0 \ - future==0.18.3 \ - Mosek==9.3.22 +# Start from the mambaorg/micromamba image +FROM mambaorg/micromamba:jammy + +# Label the image +LABEL authors="Daniel Schreyer " \ + description="Docker image containing procps and conda packages for ampliconsuite run" + +# Switch to root to install system packages +USER root + +# Install procps and other necessary packages +RUN apt-get update && \ + apt-get install -y procps && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* +# Switch back to the default user +USER $NB_UID + + +# Install Conda packages with micromamba, including Python +RUN micromamba install --yes --name base -c bioconda -c conda-forge -c mosek \ + bioconda::ampliconsuite=1.2.1 \ + mosek::mosek=10.1.21 && \ + micromamba clean --all --yes + +# Assuming AmpliconSuite-pipeline.py is accessible in /opt/conda/bin +ENV PATH="/opt/conda//bin:${PATH}" + +# Append micromamba activation command to .bashrc +RUN echo "micromamba activate base" >> ~/.bashrc + + +# Create an entrypoint script +RUN echo '#!/bin/bash' > /entrypoint.sh && \ + echo 'eval "$(micromamba shell hook --shell bash)"' >> /entrypoint.sh && \ + echo 'micromamba activate base' >> /entrypoint.sh && \ + echo 'exec "$@"' >> /entrypoint.sh && \ + chmod +x /entrypoint.sh + +ENTRYPOINT ["/entrypoint.sh"] +CMD ["/bin/bash", "-l"] diff --git a/modules/local/ampliconsuite/ampliconsuite.nf b/modules/local/ampliconsuite/ampliconsuite.nf new file mode 100644 index 00000000..1a2a56c1 --- /dev/null +++ b/modules/local/ampliconsuite/ampliconsuite.nf @@ -0,0 +1,91 @@ +process AMPLICONSUITE { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container 'nf-core/prepareaa:1.0.5' + + input: + tuple 
val(meta), path(bam) + path(mosek_license_dir) + path(aa_data_repo) + + output: + path "*.bed" , emit: bed + path "*.cns" , emit: cns, optional: true + path "*.cnr.gz" , emit: cnr, optional: true + path "*.log" , emit: log + path "*run_metadata.json" , emit: run_metadata_json + path "*sample_metadata.json" , emit: sample_metadata_json + path "*timing_log.txt" , emit: timing_log + path "*.input" , emit: ac_input, optional: true + path "*logs.txt" , emit: logs, optional: true + path "*cycles.txt" , emit: cycles, optional: true + path "*graph.txt" , emit: graph, optional: true + path "*" + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def cngain = params.aa_cngain + def ref = params.reference_build + """ + export AA_DATA_REPO=\$(echo $aa_data_repo) + export MOSEKLM_LICENSE_FILE=\$(echo $mosek_license_dir) + # Define Variables AA_SRC and AC_SRC + export AA_SRC=\$(dirname \$(python -c "import ampliconarchitectlib; print(ampliconarchitectlib.__file__)")) + export AC_SRC=\$(dirname \$(which amplicon_classifier.py)) + REF=${params.reference_build} + + AmpliconSuite-pipeline.py \\ + $args \\ + -s $prefix \\ + -t $task.cpus \\ + --bam $bam \\ + --ref $ref \\ + --run_AA --run_AC \\ + $args + + # Move Files to base work directory + find ${prefix}_cnvkit_output/ -type f -print0 | xargs -0 mv -t ./ + find ${prefix}_AA_results/ -type f -print0 | xargs -0 mv -t ./ + find ${prefix}_classification/ -type f -print0 | xargs -0 mv -t ./ + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + AmpliconSuite-pipeline.py: \$(AmpliconSuite-pipeline.py --version | sed 's/AmpliconSuite-pipeline version //') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def cngain = params.aa_cngain + def ref = params.reference_build + """ + export AA_DATA_REPO=\$(echo $aa_data_repo) + export MOSEKLM_LICENSE_FILE=\$(echo $mosek_license_dir) + # Define Variables AA_SRC and AC_SRC + export AA_SRC=\$(dirname \$(python -c "import ampliconarchitectlib; print(ampliconarchitectlib.__file__)")) + export AC_SRC=\$(dirname \$(which amplicon_classifier.py)) + REF=${params.reference_build} + + touch "${prefix}_CNV_SEEDS.bed" + touch "${prefix}.log" + touch "${prefix}.run_metadata.json" + touch "${prefix}.sample_metadata.json" + touch "${prefix}.timing_log.txt" + touch "${prefix}_summary.txt" + + AmpliconSuite-pipeline.py --help + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + AmpliconSuite-pipeline.py: \$(AmpliconSuite-pipeline.py --version | sed 's/AmpliconSuite-pipeline version //') + END_VERSIONS + """ +} diff --git a/modules/local/ampliconsuite/environment.yml b/modules/local/ampliconsuite/environment.yml new file mode 100644 index 00000000..f5cdc0b4 --- /dev/null +++ b/modules/local/ampliconsuite/environment.yml @@ -0,0 +1,9 @@ +name: ampliconsuite +channels: + - conda-forge + - bioconda + - mosek + - defaults +dependencies: + - bioconda::ampliconsuite=1.2.1 + - mosek::mosek=10.1.21 diff --git a/modules/local/ampliconsuite/prepareaa.nf b/modules/local/ampliconsuite/prepareaa.nf deleted file mode 100644 index 8484ede1..00000000 --- a/modules/local/ampliconsuite/prepareaa.nf +++ /dev/null @@ -1,71 +0,0 @@ -process PREPAREAA { - tag "$meta.id" - label 'process_low' - - conda "conda-forge::python=3.7 bioconda::pysam=0.16.0 anaconda::flask=2.2.2 conda-forge::numpy=1.21.6 conda-forge::matplotlib=3.2.2 anaconda::scipy=1.7.3 
conda-forge::intervaltree=3.0.2 anaconda::future=0.18.2 mosek::mosek=9.0.88" - - input: - tuple val(meta), path(bam), path(cns) - - output: - tuple val(meta), path("*CNV_SEEDS.bed") , emit: bed - path "*.log" , emit: log - path "*run_metadata.json" , emit: run_metadata_json - path "*sample_metadata.json" , emit: sample_metadata_json - path "*timing_log.txt" , emit: timing_log - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def cngain = params.aa_cngain - def ref = params.reference_build - """ - export AA_DATA_REPO=${params.aa_data_repo} - export MOSEKLM_LICENSE_FILE=${params.mosek_license_dir} - export AA_SRC=${projectDir}/bin - REF=${params.reference_build} - - PrepareAA.py \\ - $args \\ - -s $prefix \\ - -t $task.cpus \\ - --cnv_bed $cns \\ - --sorted_bam $bam \\ - --cngain $cngain \\ - --ref $ref - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - prepareaa: \$(echo \$(PrepareAA.py --version) | sed 's/^.*PrepareAA version //') - END_VERSIONS - """ - - stub: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def cngain = params.aa_cngain - def ref = params.reference_build - """ - export AA_DATA_REPO=${params.aa_data_repo} - export MOSEKLM_LICENSE_FILE=${params.mosek_license_dir} - REF=${params.reference_build} - - touch "${prefix}_CNV_SEEDS.bed" - touch "${prefix}.log" - touch "${prefix}.run_metadata.json" - touch "${prefix}.sample_metadata.json" - touch "${prefix}.timing_log.txt" - touch "${prefix}_summary.txt" - - PrepareAA.py --help - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - prepareaa: \$(echo \$(PrepareAA.py --version) | sed 's/^.*PrepareAA version //') - END_VERSIONS - """ -} diff --git a/modules/local/amplified_intervals.nf b/modules/local/amplified_intervals.nf deleted file mode 100644 index 56da6cbd..00000000 --- a/modules/local/amplified_intervals.nf +++ /dev/null @@ -1,61 +0,0 @@ -process AMPLIFIED_INTERVALS { - tag "$meta.id" - label 'process_low' - - conda "conda-forge::python=2.7 bioconda::pysam=0.15.2 anaconda::flask=1.1.2 anaconda::cython=0.29.14 anaconda::numpy=1.16.6 anaconda::scipy=1.2.1 conda-forge::matplotlib=2.2.5 mosek::mosek=8.0.60 anaconda::future=0.18.2" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/mulled-v2-6eefa51f13933d65b4f8155ca2f8cd81dea474ba:baa777f7c4e89a2ec4d1eab7d424a1f46503bac7-0': - 'quay.io/biocontainers/mulled-v2-6eefa51f13933d65b4f8155ca2f8cd81dea474ba:baa777f7c4e89a2ec4d1eab7d424a1f46503bac7-0' }" - - input: - tuple val(meta), path(bed), path(bam), path(bai) - - output: - tuple val(meta), path("*CNV_SEEDS.bed"), emit: bed - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def cngain = params.aa_cngain - def ref = params.reference_build - """ - export AA_DATA_REPO=${params.aa_data_repo} - export MOSEKLM_LICENSE_FILE=${params.mosek_license_dir} - REF=${params.reference_build} - - amplified_intervals.py \\ - $args \\ - --bed $bed \\ - --out ${prefix}_AA_CNV_SEEDS \\ - --bam $bam \\ - --gain $cngain \\ - --ref $ref - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: echo \$(python --version 2<&1 | sed 's/Python //g') - END_VERSIONS - """ - - stub: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def cngain = params.aa_cngain - def ref = params.reference_build - """ - export AA_DATA_REPO=${params.aa_data_repo} - export MOSEKLM_LICENSE_FILE=${params.mosek_license_dir} - REF=${params.reference_build} - - touch ${prefix}_AA_CNV_SEEDS.bed - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: echo \$(python --version 2<&1 | sed 's/Python //g') - END_VERSIONS - """ -} diff --git a/modules/local/cnvkit/segment.nf b/modules/local/cnvkit/segment.nf index b27c39b7..2ac1f16f 100644 --- a/modules/local/cnvkit/segment.nf +++ b/modules/local/cnvkit/segment.nf @@ -2,7 +2,7 @@ process CNVKIT_SEGMENT { tag "$meta.id" label 'process_low' - conda (params.enable_conda ? 'bioconda::cnvkit=0.9.9' : null) + conda 'bioconda::cnvkit=0.9.9' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/cnvkit:0.9.9--pyhdfd78af_0' : 'quay.io/biocontainers/cnvkit:0.9.9--pyhdfd78af_0' }" diff --git a/modules/local/collect_seeds.nf b/modules/local/collect_seeds.nf deleted file mode 100644 index 7654659a..00000000 --- a/modules/local/collect_seeds.nf +++ /dev/null @@ -1,52 +0,0 @@ -process COLLECT_SEEDS { - tag "$meta.id" - label 'process_low' - - conda "conda-forge::python=3.9.5" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/python:3.9--1' : - 'quay.io/biocontainers/python:3.9--1' }" - - input: - tuple val(meta), path(cns) - - output: - tuple val(meta), path("*.bed"), emit: bed - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def cngain = params.aa_cngain - """ - collect_seeds.py \\ - --sample $prefix \\ - --cns $cns \\ - --cngain $cngain - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version | sed 's/Python //g') - END_VERSIONS - """ - - stub: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def cngain = params.aa_cngain - """ - export AA_DATA_REPO=${params.aa_data_repo} - export MOSEKLM_LICENSE_FILE=${params.mosek_license_dir} - REF=${params.reference_build} - - touch ${prefix}.bed - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version | sed 's/Python //g') - END_VERSIONS - """ -} diff --git a/modules/local/multiqc/environment.yml b/modules/local/multiqc/environment.yml new file mode 100644 index 00000000..7625b752 --- /dev/null +++ b/modules/local/multiqc/environment.yml @@ -0,0 +1,7 @@ +name: multiqc +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::multiqc=1.19 diff --git a/modules/local/multiqc.nf b/modules/local/multiqc/main.nf similarity index 93% rename from modules/local/multiqc.nf rename to modules/local/multiqc/main.nf index 024968d9..0af5e8ad 100644 --- a/modules/local/multiqc.nf +++ b/modules/local/multiqc/main.nf @@ -1,10 +1,10 @@ process MULTIQC { label 'process_medium' - conda 'bioconda::multiqc=1.13a' + conda 'bioconda::multiqc=1.19' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/multiqc:1.13a--pyhdfd78af_1' : - 'quay.io/biocontainers/multiqc:1.13a--pyhdfd78af_1' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.18--pyhdfd78af_0' : + 'biocontainers/multiqc:1.19--pyhdfd78af_0' }" input: path multiqc_config diff --git a/modules/local/multiqc/tests/main.nf.test b/modules/local/multiqc/tests/main.nf.test new file mode 100644 index 00000000..d0438eda --- /dev/null +++ b/modules/local/multiqc/tests/main.nf.test @@ -0,0 +1,83 @@ +nextflow_process { + + name "Test Process MULTIQC" + script "../main.nf" + process "MULTIQC" + tag "modules" + tag "modules_nfcore" + tag "multiqc" + + test("sarscov2 single-end [fastqc]") { + + when { + process { + """ + input[0] = Channel.of([file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz_fastqc_zip'], checkIfExists: true)]) + input[1] = [] + input[2] = [] + input[3] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.out.report[0] ==~ ".*/multiqc_report.html" }, + { assert process.out.data[0] ==~ ".*/multiqc_data" }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + + test("sarscov2 single-end [fastqc] [config]") { + + when { + process { + """ + input[0] = Channel.of([file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz_fastqc_zip'], checkIfExists: true)]) + input[1] = Channel.of(file("https://github.com/nf-core/tools/raw/dev/nf_core/pipeline-template/assets/multiqc_config.yml", checkIfExists: true)) + input[2] = [] + input[3] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.out.report[0] ==~ ".*/multiqc_report.html" }, + { assert process.out.data[0] ==~ ".*/multiqc_data" }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("sarscov2 single-end [fastqc] - stub") { + + options "-stub" + + when { + process { + """ + input[0] = Channel.of([file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz_fastqc_zip'], checkIfExists: true)]) + input[1] = [] + input[2] = [] + input[3] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.report.collect { file(it).getName() } + + process.out.data.collect { file(it).getName() } + + process.out.plots.collect { file(it).getName() } + + process.out.versions ).match() } + ) + } + + } +} diff --git a/modules/local/multiqc/tests/main.nf.test.snap b/modules/local/multiqc/tests/main.nf.test.snap new file mode 100644 index 00000000..d37e7304 --- /dev/null +++ b/modules/local/multiqc/tests/main.nf.test.snap @@ -0,0 +1,21 @@ +{ + "versions": { + "content": [ + [ + "versions.yml:md5,14e9a2661241abd828f4f06a7b5c222d" + ] + ], + "timestamp": "2024-01-09T23:02:49.911994" + }, + "sarscov2 single-end [fastqc] - stub": { + "content": [ + [ + "multiqc_report.html", + "multiqc_data", + "multiqc_plots", + "versions.yml:md5,14e9a2661241abd828f4f06a7b5c222d" + ] + ], + "timestamp": "2024-01-09T23:03:14.524346" + } +} \ No newline at end of file diff --git a/modules/local/multiqc/tests/tags.yml b/modules/local/multiqc/tests/tags.yml new file mode 100644 index 00000000..bea6c0d3 --- /dev/null +++ b/modules/local/multiqc/tests/tags.yml @@ -0,0 +1,2 @@ +multiqc: + - modules/nf-core/multiqc/** diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf index 755fb044..c8dd44b2 100644 --- a/modules/local/samplesheet_check.nf +++ b/modules/local/samplesheet_check.nf @@ -2,7 +2,7 @@ process SAMPLESHEET_CHECK { tag "$samplesheet" label 
'process_low' - conda (params.enable_conda ? "conda-forge::python=3.8.3" : null) + conda "conda-forge::python=3.8.3" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/python:3.8.3' : 'biocontainers/python:3.8.3' }" diff --git a/modules/local/summarise_aa.nf b/modules/local/summarise_aa.nf deleted file mode 100644 index 739b42c1..00000000 --- a/modules/local/summarise_aa.nf +++ /dev/null @@ -1,47 +0,0 @@ -process SUMMARISE_AA { - tag "$meta.id" - label 'process_low' - - conda "pandas=1.1.5" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/pandas:1.1.5' : - 'quay.io/biocontainers/pandas:1.1.5' }" - - input: - tuple val(meta), path(summary_file), path(class_file) - - output: - tuple val(meta), path("*aa_results_summary.tsv"), emit: txt - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - """ - summarise_aa.py \\ - --summary $summary_file \\ - --class_file $class_file \\ - --id ${meta.id} \\ - --output ${prefix}.aa_results_summary.tsv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version | sed 's/Python //g') - END_VERSIONS - """ - - stub: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - """ - touch "${prefix}.aa_results_summary.tsv" - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version | sed 's/Python //g') - END_VERSIONS - """ -} diff --git a/modules/nf-core/bwa/index/environment.yml b/modules/nf-core/bwa/index/environment.yml new file mode 100644 index 00000000..5d3cb323 --- /dev/null +++ b/modules/nf-core/bwa/index/environment.yml @@ -0,0 +1,7 @@ +name: bwa_index +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bwa=0.7.17 diff --git a/modules/nf-core/bwa/index/main.nf b/modules/nf-core/bwa/index/main.nf index 8d2e56d9..24b5a2ea 100644 --- a/modules/nf-core/bwa/index/main.nf +++ b/modules/nf-core/bwa/index/main.nf @@ -2,7 +2,7 @@ process BWA_INDEX { tag "$fasta" label 'process_single' - conda "bioconda::bwa=0.7.17" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/bwa:0.7.17--hed695b0_7' : 'biocontainers/bwa:0.7.17--hed695b0_7' }" @@ -18,13 +18,14 @@ process BWA_INDEX { task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${fasta.baseName}" + def args = task.ext.args ?: '' """ mkdir bwa bwa \\ index \\ $args \\ - -p bwa/${fasta.baseName} \\ + -p bwa/${prefix} \\ $fasta cat <<-END_VERSIONS > versions.yml @@ -34,14 +35,15 @@ process BWA_INDEX { """ stub: + def prefix = task.ext.prefix ?: "${fasta.baseName}" """ mkdir bwa - touch bwa/genome.amb - touch bwa/genome.ann - touch bwa/genome.bwt - touch bwa/genome.pac - touch bwa/genome.sa + touch bwa/${prefix}.amb + touch bwa/${prefix}.ann + touch bwa/${prefix}.bwt + touch bwa/${prefix}.pac + touch bwa/${prefix}.sa cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/bwa/index/meta.yml b/modules/nf-core/bwa/index/meta.yml index 2c6cfcd7..730628d0 100644 --- a/modules/nf-core/bwa/index/meta.yml +++ b/modules/nf-core/bwa/index/meta.yml @@ -40,3 +40,6 @@ output: authors: - "@drpatelh" - "@maxulysse" +maintainers: + - "@drpatelh" + - "@maxulysse" diff --git a/modules/nf-core/bwa/index/tests/main.nf.test b/modules/nf-core/bwa/index/tests/main.nf.test new file mode 100644 index 00000000..5fc8d496 --- /dev/null +++ b/modules/nf-core/bwa/index/tests/main.nf.test @@ -0,0 +1,33 @@ +nextflow_process { + + name "Test Process BWA_INDEX" + tag "modules_nfcore" + tag "modules" + tag "bwa" + tag "bwa/index" + script "../main.nf" + process "BWA_INDEX" + + test("BWA index") { + + when { + process { + """ + input[0] = [ + [id: 'test'], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/bwa/index/tests/main.nf.test.snap b/modules/nf-core/bwa/index/tests/main.nf.test.snap new file mode 100644 index 00000000..e51ad5bf --- /dev/null +++ b/modules/nf-core/bwa/index/tests/main.nf.test.snap @@ -0,0 +1,43 @@ +{ + "BWA index": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + [ + "genome.amb:md5,3a68b8b2287e07dd3f5f95f4344ba76e", + "genome.ann:md5,c32e11f6c859f166c7525a9c1d583567", + "genome.bwt:md5,0469c30a1e239dd08f68afe66fde99da", + "genome.pac:md5,983e3d2cd6f36e2546e6d25a0da78d66", + "genome.sa:md5,ab3952cabf026b48cd3eb5bccbb636d1" + ] + ] + ], + "1": [ + "versions.yml:md5,0f20525da90e7489a7ebb02adca3265f" + ], + "index": [ + [ + { + "id": "test" + }, + [ + "genome.amb:md5,3a68b8b2287e07dd3f5f95f4344ba76e", + "genome.ann:md5,c32e11f6c859f166c7525a9c1d583567", + "genome.bwt:md5,0469c30a1e239dd08f68afe66fde99da", + "genome.pac:md5,983e3d2cd6f36e2546e6d25a0da78d66", + "genome.sa:md5,ab3952cabf026b48cd3eb5bccbb636d1" + ] + ] + ], + "versions": [ + "versions.yml:md5,0f20525da90e7489a7ebb02adca3265f" + ] + } + ], + "timestamp": "2023-10-17T17:20:20.180927714" + } +} \ No newline at end of file diff --git a/modules/nf-core/bwa/index/tests/tags.yml b/modules/nf-core/bwa/index/tests/tags.yml new file mode 100644 index 00000000..28bb483c --- /dev/null +++ b/modules/nf-core/bwa/index/tests/tags.yml @@ -0,0 +1,2 @@ +bwa/index: + - modules/nf-core/bwa/index/** diff --git a/modules/nf-core/cat/fastq/environment.yml b/modules/nf-core/cat/fastq/environment.yml new file mode 100644 index 00000000..bff93add --- /dev/null +++ b/modules/nf-core/cat/fastq/environment.yml @@ -0,0 +1,7 @@ +name: cat_fastq +channels: 
+ - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::sed=4.7 diff --git a/modules/nf-core/cat/fastq/main.nf b/modules/nf-core/cat/fastq/main.nf index 5021e6fc..3d963784 100644 --- a/modules/nf-core/cat/fastq/main.nf +++ b/modules/nf-core/cat/fastq/main.nf @@ -2,7 +2,7 @@ process CAT_FASTQ { tag "$meta.id" label 'process_single' - conda "conda-forge::sed=4.7" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : 'nf-core/ubuntu:20.04' }" diff --git a/modules/nf-core/cat/fastq/meta.yml b/modules/nf-core/cat/fastq/meta.yml index 8a39e309..db4ac3c7 100644 --- a/modules/nf-core/cat/fastq/meta.yml +++ b/modules/nf-core/cat/fastq/meta.yml @@ -34,7 +34,9 @@ output: type: file description: File containing software versions pattern: "versions.yml" - authors: - "@joseespinosa" - "@drpatelh" +maintainers: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/cat/fastq/tests/main.nf.test b/modules/nf-core/cat/fastq/tests/main.nf.test new file mode 100644 index 00000000..f5f94182 --- /dev/null +++ b/modules/nf-core/cat/fastq/tests/main.nf.test @@ -0,0 +1,143 @@ +nextflow_process { + + name "Test Process CAT_FASTQ" + script "../main.nf" + process "CAT_FASTQ" + tag "modules" + tag "modules_nfcore" + tag "cat" + tag "cat/fastq" + + test("test_cat_fastq_single_end") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test2_1_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.reads).match() }, + { assert path(process.out.versions.get(0)).getText().contains("cat") } + ) + } + } + + test("test_cat_fastq_paired_end") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test2_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test2_2_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.reads).match() }, + { assert path(process.out.versions.get(0)).getText().contains("cat") } + ) + } + } + + test("test_cat_fastq_single_end_same_name") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.reads).match() }, + { assert path(process.out.versions.get(0)).getText().contains("cat") } + ) + } + } + + test("test_cat_fastq_paired_end_same_name") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: 
true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.reads).match() }, + { assert path(process.out.versions.get(0)).getText().contains("cat") } + ) + } + } + + test("test_cat_fastq_single_end_single_file") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true)] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.reads).match() }, + { assert path(process.out.versions.get(0)).getText().contains("cat") } + ) + } + } +} diff --git a/modules/nf-core/cat/fastq/tests/main.nf.test.snap b/modules/nf-core/cat/fastq/tests/main.nf.test.snap new file mode 100644 index 00000000..ec2342e5 --- /dev/null +++ b/modules/nf-core/cat/fastq/tests/main.nf.test.snap @@ -0,0 +1,78 @@ +{ + "test_cat_fastq_single_end": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,f9cf5e375f7de81a406144a2c70cc64d" + ] + ] + ], + "timestamp": "2023-10-17T23:19:12.990284837" + }, + "test_cat_fastq_single_end_same_name": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,63f817db7a29a03eb538104495556f66" + ] + ] + ], + "timestamp": "2023-10-17T23:19:31.554568147" + }, + "test_cat_fastq_single_end_single_file": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,e325ef7deb4023447a1f074e285761af" + ] + ] + ], + "timestamp": "2023-10-17T23:19:49.629360033" + }, + "test_cat_fastq_paired_end_same_name": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.merged.fastq.gz:md5,63f817db7a29a03eb538104495556f66", + "test_2.merged.fastq.gz:md5,fe9f266f43a6fc3dcab690a18419a56e" + ] + ] + ] + ], + "timestamp": "2023-10-17T23:19:40.711617539" + }, + "test_cat_fastq_paired_end": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.merged.fastq.gz:md5,f9cf5e375f7de81a406144a2c70cc64d", + "test_2.merged.fastq.gz:md5,77c8e966e130d8c6b6ec9be52fcb2bda" + ] + ] + ] + ], + "timestamp": "2023-10-18T07:53:20.923560211" + } +} \ No newline at end of file diff --git a/modules/nf-core/cat/fastq/tests/tags.yml b/modules/nf-core/cat/fastq/tests/tags.yml new file mode 100644 index 00000000..6ac43614 --- /dev/null +++ b/modules/nf-core/cat/fastq/tests/tags.yml @@ -0,0 +1,2 @@ +cat/fastq: + - modules/nf-core/cat/fastq/** diff --git a/modules/nf-core/custom/dumpsoftwareversions/environment.yml b/modules/nf-core/custom/dumpsoftwareversions/environment.yml new file mode 100644 index 00000000..9b3272bc --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/environment.yml @@ -0,0 +1,7 @@ +name: custom_dumpsoftwareversions +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::multiqc=1.19 diff --git a/modules/nf-core/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf index ebc87273..f2187611 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/main.nf +++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf @@ -2,10 +2,10 @@ process 
CUSTOM_DUMPSOFTWAREVERSIONS { label 'process_single' // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container - conda "bioconda::multiqc=1.14" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.14--pyhdfd78af_0' : - 'biocontainers/multiqc:1.14--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.19--pyhdfd78af_0' : + 'biocontainers/multiqc:1.19--pyhdfd78af_0' }" input: path versions diff --git a/modules/nf-core/custom/dumpsoftwareversions/meta.yml b/modules/nf-core/custom/dumpsoftwareversions/meta.yml index c32657de..5f15a5fd 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/meta.yml +++ b/modules/nf-core/custom/dumpsoftwareversions/meta.yml @@ -1,4 +1,4 @@ -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json name: custom_dumpsoftwareversions description: Custom module used to dump software versions within the nf-core pipeline template keywords: @@ -16,7 +16,6 @@ input: type: file description: YML file containing software versions pattern: "*.yml" - output: - yml: type: file @@ -30,7 +29,9 @@ output: type: file description: File containing software versions pattern: "versions.yml" - authors: - "@drpatelh" - "@grst" +maintainers: + - "@drpatelh" + - "@grst" diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test new file mode 100644 index 00000000..b1e1630b --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test @@ -0,0 +1,43 @@ +nextflow_process { + + name "Test Process CUSTOM_DUMPSOFTWAREVERSIONS" + script "../main.nf" + process "CUSTOM_DUMPSOFTWAREVERSIONS" + tag "modules" + tag "modules_nfcore" + tag "custom" + tag "dumpsoftwareversions" + tag "custom/dumpsoftwareversions" + + test("Should run without failures") { + when { + process { + """ + def tool1_version = ''' + TOOL1: + tool1: 0.11.9 + '''.stripIndent() + + def tool2_version = ''' + TOOL2: + tool2: 1.9 + '''.stripIndent() + + input[0] = Channel.of(tool1_version, tool2_version).collectFile() + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.versions, + file(process.out.mqc_yml[0]).readLines()[0..10], + file(process.out.yml[0]).readLines()[0..7] + ).match() + } + ) + } + } +} diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap new file mode 100644 index 00000000..5f59a936 --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap @@ -0,0 +1,33 @@ +{ + "Should run without failures": { + "content": [ + [ + "versions.yml:md5,76d454d92244589d32455833f7c1ba6d" + ], + [ + "data: \"\\n\\n \\n \\n \\n \\n \\n \\n \\n\\", + " \\n\\n\\n \\n \\n\\", + " \\ \\n\\n\\n\\n \\n \\", + " \\ \\n \\n\\n\\n\\n\\", + " \\n\\n \\n \\n\\", + " \\ \\n\\n\\n\\n\\n\\n \\n\\", + " \\ \\n \\n\\n\\n\\n\\", + " \\n\\n \\n \\n\\" + ], + [ + "CUSTOM_DUMPSOFTWAREVERSIONS:", + " python: 3.11.7", + " yaml: 5.4.1", + "TOOL1:", + " tool1: 0.11.9", + "TOOL2:", + " tool2: '1.9'", + "Workflow:" + ] + ], + "timestamp": "2024-01-09T23:01:18.710682" + } +} \ No newline at end of file diff --git 
a/modules/nf-core/custom/dumpsoftwareversions/tests/tags.yml b/modules/nf-core/custom/dumpsoftwareversions/tests/tags.yml new file mode 100644 index 00000000..405aa24a --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/tests/tags.yml @@ -0,0 +1,2 @@ +custom/dumpsoftwareversions: + - modules/nf-core/custom/dumpsoftwareversions/** diff --git a/modules/nf-core/fastqc/environment.yml b/modules/nf-core/fastqc/environment.yml new file mode 100644 index 00000000..1787b38a --- /dev/null +++ b/modules/nf-core/fastqc/environment.yml @@ -0,0 +1,7 @@ +name: fastqc +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::fastqc=0.12.1 diff --git a/modules/nf-core/fastqc/main.nf b/modules/nf-core/fastqc/main.nf index 07d5e433..9e19a74c 100644 --- a/modules/nf-core/fastqc/main.nf +++ b/modules/nf-core/fastqc/main.nf @@ -2,10 +2,10 @@ process FASTQC { tag "$meta.id" label 'process_medium' - conda "bioconda::fastqc=0.11.9" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/fastqc:0.11.9--0' : - 'biocontainers/fastqc:0.11.9--0' }" + 'https://depot.galaxyproject.org/singularity/fastqc:0.12.1--hdfd78af_0' : + 'biocontainers/fastqc:0.12.1--hdfd78af_0' }" input: tuple val(meta), path(reads) @@ -29,11 +29,15 @@ process FASTQC { printf "%s %s\\n" $rename_to | while read old_name new_name; do [ -f "\${new_name}" ] || ln -s \$old_name \$new_name done - fastqc $args --threads $task.cpus $renamed_files + + fastqc \\ + $args \\ + --threads $task.cpus \\ + $renamed_files cat <<-END_VERSIONS > versions.yml "${task.process}": - fastqc: \$( fastqc --version | sed -e "s/FastQC v//g" ) + fastqc: \$( fastqc --version | sed '/FastQC v/!d; s/.*v//' ) END_VERSIONS """ @@ -45,7 +49,7 @@ process FASTQC { cat <<-END_VERSIONS > versions.yml "${task.process}": - fastqc: \$( fastqc --version | sed -e "s/FastQC v//g" ) + fastqc: \$( fastqc --version | sed '/FastQC v/!d; s/.*v//' ) END_VERSIONS """ } diff --git a/modules/nf-core/fastqc/meta.yml b/modules/nf-core/fastqc/meta.yml index 4da5bb5a..ee5507e0 100644 --- a/modules/nf-core/fastqc/meta.yml +++ b/modules/nf-core/fastqc/meta.yml @@ -50,3 +50,8 @@ authors: - "@grst" - "@ewels" - "@FelixKrueger" +maintainers: + - "@drpatelh" + - "@grst" + - "@ewels" + - "@FelixKrueger" diff --git a/modules/nf-core/fastqc/tests/main.nf.test b/modules/nf-core/fastqc/tests/main.nf.test new file mode 100644 index 00000000..1f21c664 --- /dev/null +++ b/modules/nf-core/fastqc/tests/main.nf.test @@ -0,0 +1,212 @@ +nextflow_process { + + name "Test Process FASTQC" + script "../main.nf" + process "FASTQC" + + tag "modules" + tag "modules_nfcore" + tag "fastqc" + + test("sarscov2 single-end [fastq]") { + + when { + process { + """ + input[0] = Channel.of([ + [ id: 'test', single_end:true ], + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ] + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + + // NOTE The report contains the date inside it, which means that the md5sum is stable per day, but not longer than that. So you can't md5sum it. + // looks like this:
<div id="header_filename">Mon 2 Oct 2023<br/>test.gz</div>
    + // https://github.com/nf-core/modules/pull/3903#issuecomment-1743620039 + + { assert process.out.html[0][1] ==~ ".*/test_fastqc.html" }, + { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" }, + { assert path(process.out.html[0][1]).text.contains("") }, + + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("sarscov2 paired-end [fastq]") { + + when { + process { + """ + input[0] = Channel.of([ + [id: 'test', single_end: false], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ] + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + + { assert process.out.html[0][1][0] ==~ ".*/test_1_fastqc.html" }, + { assert process.out.html[0][1][1] ==~ ".*/test_2_fastqc.html" }, + { assert process.out.zip[0][1][0] ==~ ".*/test_1_fastqc.zip" }, + { assert process.out.zip[0][1][1] ==~ ".*/test_2_fastqc.zip" }, + { assert path(process.out.html[0][1][0]).text.contains("") }, + { assert path(process.out.html[0][1][1]).text.contains("") }, + + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("sarscov2 interleaved [fastq]") { + + when { + process { + """ + input[0] = Channel.of([ + [id: 'test', single_end: false], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_interleaved.fastq.gz', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + + { assert process.out.html[0][1] ==~ ".*/test_fastqc.html" }, + { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" }, + { assert path(process.out.html[0][1]).text.contains("") }, + + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("sarscov2 paired-end [bam]") { + + when { + process { + """ + input[0] = Channel.of([ + [id: 'test', single_end: false], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + + { assert process.out.html[0][1] ==~ ".*/test_fastqc.html" }, + { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" }, + { assert path(process.out.html[0][1]).text.contains("") }, + + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("sarscov2 multiple [fastq]") { + + when { + process { + """ + input[0] = Channel.of([ + [id: 'test', single_end: false], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_2.fastq.gz', checkIfExists: true) ] + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + + { assert process.out.html[0][1][0] ==~ ".*/test_1_fastqc.html" }, + { assert process.out.html[0][1][1] ==~ ".*/test_2_fastqc.html" }, + { assert process.out.html[0][1][2] ==~ ".*/test_3_fastqc.html" }, + { assert process.out.html[0][1][3] ==~ ".*/test_4_fastqc.html" }, + { assert process.out.zip[0][1][0] ==~ ".*/test_1_fastqc.zip" }, + { assert process.out.zip[0][1][1] ==~ 
".*/test_2_fastqc.zip" }, + { assert process.out.zip[0][1][2] ==~ ".*/test_3_fastqc.zip" }, + { assert process.out.zip[0][1][3] ==~ ".*/test_4_fastqc.zip" }, + { assert path(process.out.html[0][1][0]).text.contains("") }, + { assert path(process.out.html[0][1][1]).text.contains("") }, + { assert path(process.out.html[0][1][2]).text.contains("") }, + { assert path(process.out.html[0][1][3]).text.contains("") }, + + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("sarscov2 custom_prefix") { + + when { + process { + """ + input[0] = Channel.of([ + [ id:'mysample', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + + { assert process.out.html[0][1] ==~ ".*/mysample_fastqc.html" }, + { assert process.out.zip[0][1] ==~ ".*/mysample_fastqc.zip" }, + { assert path(process.out.html[0][1]).text.contains("") }, + + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("sarscov2 single-end [fastq] - stub") { + + options "-stub" + + when { + process { + """ + input[0] = Channel.of([ + [ id: 'test', single_end:true ], + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ] + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.html.collect { file(it[1]).getName() } + + process.out.zip.collect { file(it[1]).getName() } + + process.out.versions ).match() } + ) + } + } + +} diff --git a/modules/nf-core/fastqc/tests/main.nf.test.snap b/modules/nf-core/fastqc/tests/main.nf.test.snap new file mode 100644 index 00000000..5d624bb8 --- /dev/null +++ b/modules/nf-core/fastqc/tests/main.nf.test.snap @@ -0,0 +1,20 @@ +{ + "sarscov2 single-end [fastq] - stub": { + "content": [ + [ + "test.html", + "test.zip", + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ] + ], + "timestamp": "2024-01-17T18:40:57.254299" + }, + "versions": { + "content": [ + [ + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ] + ], + "timestamp": "2024-01-17T18:36:50.033627" + } +} \ No newline at end of file diff --git a/modules/nf-core/fastqc/tests/tags.yml b/modules/nf-core/fastqc/tests/tags.yml new file mode 100644 index 00000000..7834294b --- /dev/null +++ b/modules/nf-core/fastqc/tests/tags.yml @@ -0,0 +1,2 @@ +fastqc: + - modules/nf-core/fastqc/** diff --git a/modules/nf-core/minimap2/align/environment.yml b/modules/nf-core/minimap2/align/environment.yml new file mode 100644 index 00000000..de1f3811 --- /dev/null +++ b/modules/nf-core/minimap2/align/environment.yml @@ -0,0 +1,8 @@ +name: minimap2_align +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::minimap2=2.24 + - bioconda::samtools=1.18 diff --git a/modules/nf-core/minimap2/align/main.nf b/modules/nf-core/minimap2/align/main.nf index 4da47c18..47cd420c 100644 --- a/modules/nf-core/minimap2/align/main.nf +++ b/modules/nf-core/minimap2/align/main.nf @@ -3,14 +3,14 @@ process MINIMAP2_ALIGN { label 'process_medium' // Note: the versions here need to match the versions used in the mulled container below and minimap2/index - conda "bioconda::minimap2=2.24 bioconda::samtools=1.14" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0' : - 'biocontainers/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0' }" + 'https://depot.galaxyproject.org/singularity/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:365b17b986c1a60c1b82c6066a9345f38317b763-0' : + 'biocontainers/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:365b17b986c1a60c1b82c6066a9345f38317b763-0' }" input: tuple val(meta), path(reads) - path reference + tuple val(meta2), path(reference) val bam_format val cigar_paf_format val cigar_bam diff --git a/modules/nf-core/minimap2/align/meta.yml b/modules/nf-core/minimap2/align/meta.yml index 991b39a0..408522d5 100644 --- a/modules/nf-core/minimap2/align/meta.yml +++ b/modules/nf-core/minimap2/align/meta.yml @@ -25,6 +25,11 @@ input: description: | List of input FASTA or FASTQ files of size 1 and 2 for single-end and paired-end data, respectively. + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test_ref'] - reference: type: file description: | @@ -63,3 +68,8 @@ authors: - "@sofstam" - "@sateeshperi" - "@jfy133" +maintainers: + - "@heuermh" + - "@sofstam" + - "@sateeshperi" + - "@jfy133" diff --git a/modules/nf-core/minimap2/align/tests/main.nf.test b/modules/nf-core/minimap2/align/tests/main.nf.test new file mode 100644 index 00000000..b634468b --- /dev/null +++ b/modules/nf-core/minimap2/align/tests/main.nf.test @@ -0,0 +1,145 @@ +nextflow_process { + + name "Test Process MINIMAP2_ALIGN" + script "../main.nf" + process "MINIMAP2_ALIGN" + + tag "modules" + tag "modules_nfcore" + tag "minimap2" + tag "minimap2/align" + + test("sarscov2 - fastq, fasta, true, false, false") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] + input[1] = [ + [ id:'test_ref' ], // meta map + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + input[2] = true + input[3] = false + input[4] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - [fastq1, fastq2], fasta, true, false, false") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) + ] + ] + input[1] = [ + [ id:'test_ref' ], // meta map + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + input[2] = true + input[3] = false + input[4] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - fastq, [], true, false, false") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] + input[1] = [ + [ id:'test_ref' ], // meta map + [] + ] + input[2] = true + input[3] = false + input[4] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, 
+ process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - fastq, fasta, true, false, false - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] + input[1] = [ + [ id:'test_ref' ], // meta map + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + input[2] = true + input[3] = false + input[4] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.versions + ).match() } + ) + } + + } + +} diff --git a/modules/nf-core/minimap2/align/tests/main.nf.test.snap b/modules/nf-core/minimap2/align/tests/main.nf.test.snap new file mode 100644 index 00000000..a39a1697 --- /dev/null +++ b/modules/nf-core/minimap2/align/tests/main.nf.test.snap @@ -0,0 +1,38 @@ +{ + "sarscov2 - fastq, fasta, true, false, false": { + "content": [ + "test.bam", + [ + "versions.yml:md5,9e9eeae0002d466d580a9d6e0d003eb1" + ] + ], + "timestamp": "2023-12-04T12:07:06.01315354" + }, + "sarscov2 - fastq, fasta, true, false, false - stub": { + "content": [ + "test.bam", + [ + "versions.yml:md5,9e9eeae0002d466d580a9d6e0d003eb1" + ] + ], + "timestamp": "2023-12-04T12:07:24.487175659" + }, + "sarscov2 - [fastq1, fastq2], fasta, true, false, false": { + "content": [ + "test.bam", + [ + "versions.yml:md5,9e9eeae0002d466d580a9d6e0d003eb1" + ] + ], + "timestamp": "2023-12-04T12:07:12.50816279" + }, + "sarscov2 - fastq, [], true, false, false": { + "content": [ + "test.bam", + [ + "versions.yml:md5,9e9eeae0002d466d580a9d6e0d003eb1" + ] + ], + "timestamp": "2023-12-04T12:07:18.414974788" + } +} \ No newline at end of file diff --git a/modules/nf-core/minimap2/align/tests/tags.yml b/modules/nf-core/minimap2/align/tests/tags.yml new file mode 100644 index 00000000..39dba374 --- /dev/null +++ b/modules/nf-core/minimap2/align/tests/tags.yml @@ -0,0 +1,2 @@ +minimap2/align: + - "modules/nf-core/minimap2/align/**" diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf deleted file mode 100644 index 1fc387be..00000000 --- a/modules/nf-core/multiqc/main.nf +++ /dev/null @@ -1,53 +0,0 @@ -process MULTIQC { - label 'process_single' - - conda "bioconda::multiqc=1.14" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.14--pyhdfd78af_0' : - 'biocontainers/multiqc:1.14--pyhdfd78af_0' }" - - input: - path multiqc_files, stageAs: "?/*" - path(multiqc_config) - path(extra_multiqc_config) - path(multiqc_logo) - - output: - path "*multiqc_report.html", emit: report - path "*_data" , emit: data - path "*_plots" , optional:true, emit: plots - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def config = multiqc_config ? "--config $multiqc_config" : '' - def extra_config = extra_multiqc_config ? "--config $extra_multiqc_config" : '' - """ - multiqc \\ - --force \\ - $args \\ - $config \\ - $extra_config \\ - . 
- - cat <<-END_VERSIONS > versions.yml - "${task.process}": - multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) - END_VERSIONS - """ - - stub: - """ - touch multiqc_data - touch multiqc_plots - touch multiqc_report.html - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) - END_VERSIONS - """ -} diff --git a/modules/nf-core/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml index f93b5ee5..45a9bc35 100644 --- a/modules/nf-core/multiqc/meta.yml +++ b/modules/nf-core/multiqc/meta.yml @@ -1,5 +1,4 @@ -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json -name: MultiQC +name: multiqc description: Aggregate results from bioinformatics analyses across many samples into a single report keywords: - QC @@ -13,7 +12,6 @@ tools: homepage: https://multiqc.info/ documentation: https://multiqc.info/docs/ licence: ["GPL-3.0-or-later"] - input: - multiqc_files: type: file @@ -31,7 +29,6 @@ input: type: file description: Optional logo file for MultiQC pattern: "*.{png}" - output: - report: type: file @@ -54,3 +51,8 @@ authors: - "@bunop" - "@drpatelh" - "@jfy133" +maintainers: + - "@abhi18av" + - "@bunop" + - "@drpatelh" + - "@jfy133" diff --git a/modules/nf-core/picard/markduplicates/environment.yml b/modules/nf-core/picard/markduplicates/environment.yml new file mode 100644 index 00000000..58b795f5 --- /dev/null +++ b/modules/nf-core/picard/markduplicates/environment.yml @@ -0,0 +1,7 @@ +name: picard_markduplicates +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::picard=3.1.1 diff --git a/modules/nf-core/picard/markduplicates/main.nf b/modules/nf-core/picard/markduplicates/main.nf index facd7efb..80930cc4 100644 --- a/modules/nf-core/picard/markduplicates/main.nf +++ b/modules/nf-core/picard/markduplicates/main.nf @@ -2,10 +2,10 @@ process PICARD_MARKDUPLICATES { tag "$meta.id" label 'process_medium' - conda "bioconda::picard=3.0.0" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/picard:3.0.0--hdfd78af_1' : - 'biocontainers/picard:3.0.0--hdfd78af_1' }" + 'https://depot.galaxyproject.org/singularity/picard:3.1.1--hdfd78af_0' : + 'biocontainers/picard:3.1.1--hdfd78af_0' }" input: tuple val(meta), path(bam) @@ -30,6 +30,9 @@ process PICARD_MARKDUPLICATES { } else { avail_mem = (task.memory.mega*0.8).intValue() } + + if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ picard \\ -Xmx${avail_mem}M \\ @@ -48,6 +51,7 @@ process PICARD_MARKDUPLICATES { stub: def prefix = task.ext.prefix ?: "${meta.id}" + if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
""" touch ${prefix}.bam touch ${prefix}.bam.bai diff --git a/modules/nf-core/picard/markduplicates/meta.yml b/modules/nf-core/picard/markduplicates/meta.yml index f7693d2f..1ab90c07 100644 --- a/modules/nf-core/picard/markduplicates/meta.yml +++ b/modules/nf-core/picard/markduplicates/meta.yml @@ -69,3 +69,7 @@ authors: - "@drpatelh" - "@projectoriented" - "@ramprasadn" +maintainers: + - "@drpatelh" + - "@projectoriented" + - "@ramprasadn" diff --git a/modules/nf-core/picard/markduplicates/tests/main.nf.test b/modules/nf-core/picard/markduplicates/tests/main.nf.test new file mode 100644 index 00000000..b2bba094 --- /dev/null +++ b/modules/nf-core/picard/markduplicates/tests/main.nf.test @@ -0,0 +1,111 @@ +nextflow_process { + + name "Test Process PICARD_MARKDUPLICATES" + script "../main.nf" + process "PICARD_MARKDUPLICATES" + config "./nextflow.config" + tag "modules" + tag "modules_nfcore" + tag "picard" + tag "picard/markduplicates" + + test("sarscov2 - bam, fasta, fai - sorted bam") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true) + ] + input[1] = [ + [ id:'genome' ], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + input[2] = [ + [ id:'genome' ], + file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + path(process.out.metrics.get(0).get(1)).readLines()[0..2], + process.out.versions + ).match() } + ) + } + } + + test("sarscov2 - bam, fasta, fai - unsorted bam") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['test_paired_end_bam'], checkIfExists: true) + ] + input[1] = [ + [ id:'genome' ], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + input[2] = [ + [ id:'genome' ], + file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + path(process.out.metrics.get(0).get(1)).readLines()[0..2], + process.out.versions + ).match() } + ) + } + } + + test("homo_sapiens - cram, fasta, fai") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_cram'], checkIfExists: true) + ] + input[1] = [ + [ id:'genome' ], + file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) + ] + input[2] = [ + [ id:'genome' ], + file(params.test_data['homo_sapiens']['genome']['genome_fasta_fai'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + path(process.out.metrics.get(0).get(1)).readLines()[0..2], + process.out.versions + ).match() } + ) + } + } + +} diff --git a/modules/nf-core/picard/markduplicates/tests/main.nf.test.snap b/modules/nf-core/picard/markduplicates/tests/main.nf.test.snap new file mode 100644 index 00000000..cd788a4d --- /dev/null +++ b/modules/nf-core/picard/markduplicates/tests/main.nf.test.snap @@ -0,0 +1,44 @@ +{ + "sarscov2 - bam, fasta, fai - unsorted bam": { + "content": [ + "test.marked.bam", + [ + "## 
htsjdk.samtools.metrics.StringHeader", + "# MarkDuplicates --INPUT test.paired_end.bam --OUTPUT test.marked.bam --METRICS_FILE test.marked.MarkDuplicates.metrics.txt --ASSUME_SORT_ORDER queryname --REFERENCE_SEQUENCE genome.fasta --MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP 50000 --MAX_FILE_HANDLES_FOR_READ_ENDS_MAP 8000 --SORTING_COLLECTION_SIZE_RATIO 0.25 --TAG_DUPLICATE_SET_MEMBERS false --REMOVE_SEQUENCING_DUPLICATES false --TAGGING_POLICY DontTag --CLEAR_DT true --DUPLEX_UMI false --FLOW_MODE false --FLOW_QUALITY_SUM_STRATEGY false --USE_END_IN_UNPAIRED_READS false --USE_UNPAIRED_CLIPPED_END false --UNPAIRED_END_UNCERTAINTY 0 --FLOW_SKIP_FIRST_N_FLOWS 0 --FLOW_Q_IS_KNOWN_END false --FLOW_EFFECTIVE_QUALITY_THRESHOLD 15 --ADD_PG_TAG_TO_READS true --REMOVE_DUPLICATES false --ASSUME_SORTED false --DUPLICATE_SCORING_STRATEGY SUM_OF_BASE_QUALITIES --PROGRAM_RECORD_ID MarkDuplicates --PROGRAM_GROUP_NAME MarkDuplicates --READ_NAME_REGEX --OPTICAL_DUPLICATE_PIXEL_DISTANCE 100 --MAX_OPTICAL_DUPLICATE_SET_SIZE 300000 --VERBOSITY INFO --QUIET false --VALIDATION_STRINGENCY STRICT --COMPRESSION_LEVEL 5 --MAX_RECORDS_IN_RAM 500000 --CREATE_INDEX false --CREATE_MD5_FILE false --help false --version false --showHidden false --USE_JDK_DEFLATER false --USE_JDK_INFLATER false", + "## htsjdk.samtools.metrics.StringHeader" + ], + [ + "versions.yml:md5,b699af51b1956f3810f8a7c066e0ab17" + ] + ], + "timestamp": "2023-11-28T10:50:37.735339781" + }, + "homo_sapiens - cram, fasta, fai": { + "content": [ + "test.marked.bam", + [ + "## htsjdk.samtools.metrics.StringHeader", + "# MarkDuplicates --INPUT test.paired_end.sorted.cram --OUTPUT test.marked.bam --METRICS_FILE test.marked.MarkDuplicates.metrics.txt --ASSUME_SORT_ORDER queryname --REFERENCE_SEQUENCE genome.fasta --MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP 50000 --MAX_FILE_HANDLES_FOR_READ_ENDS_MAP 8000 --SORTING_COLLECTION_SIZE_RATIO 0.25 --TAG_DUPLICATE_SET_MEMBERS false --REMOVE_SEQUENCING_DUPLICATES false --TAGGING_POLICY DontTag --CLEAR_DT true --DUPLEX_UMI false --FLOW_MODE false --FLOW_QUALITY_SUM_STRATEGY false --USE_END_IN_UNPAIRED_READS false --USE_UNPAIRED_CLIPPED_END false --UNPAIRED_END_UNCERTAINTY 0 --FLOW_SKIP_FIRST_N_FLOWS 0 --FLOW_Q_IS_KNOWN_END false --FLOW_EFFECTIVE_QUALITY_THRESHOLD 15 --ADD_PG_TAG_TO_READS true --REMOVE_DUPLICATES false --ASSUME_SORTED false --DUPLICATE_SCORING_STRATEGY SUM_OF_BASE_QUALITIES --PROGRAM_RECORD_ID MarkDuplicates --PROGRAM_GROUP_NAME MarkDuplicates --READ_NAME_REGEX --OPTICAL_DUPLICATE_PIXEL_DISTANCE 100 --MAX_OPTICAL_DUPLICATE_SET_SIZE 300000 --VERBOSITY INFO --QUIET false --VALIDATION_STRINGENCY STRICT --COMPRESSION_LEVEL 5 --MAX_RECORDS_IN_RAM 500000 --CREATE_INDEX false --CREATE_MD5_FILE false --help false --version false --showHidden false --USE_JDK_DEFLATER false --USE_JDK_INFLATER false", + "## htsjdk.samtools.metrics.StringHeader" + ], + [ + "versions.yml:md5,b699af51b1956f3810f8a7c066e0ab17" + ] + ], + "timestamp": "2023-11-28T10:50:48.897954543" + }, + "sarscov2 - bam, fasta, fai - sorted bam": { + "content": [ + "test.marked.bam", + [ + "## htsjdk.samtools.metrics.StringHeader", + "# MarkDuplicates --INPUT test.paired_end.sorted.bam --OUTPUT test.marked.bam --METRICS_FILE test.marked.MarkDuplicates.metrics.txt --ASSUME_SORT_ORDER queryname --REFERENCE_SEQUENCE genome.fasta --MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP 50000 --MAX_FILE_HANDLES_FOR_READ_ENDS_MAP 8000 --SORTING_COLLECTION_SIZE_RATIO 0.25 --TAG_DUPLICATE_SET_MEMBERS false --REMOVE_SEQUENCING_DUPLICATES false --TAGGING_POLICY DontTag --CLEAR_DT true 
--DUPLEX_UMI false --FLOW_MODE false --FLOW_QUALITY_SUM_STRATEGY false --USE_END_IN_UNPAIRED_READS false --USE_UNPAIRED_CLIPPED_END false --UNPAIRED_END_UNCERTAINTY 0 --FLOW_SKIP_FIRST_N_FLOWS 0 --FLOW_Q_IS_KNOWN_END false --FLOW_EFFECTIVE_QUALITY_THRESHOLD 15 --ADD_PG_TAG_TO_READS true --REMOVE_DUPLICATES false --ASSUME_SORTED false --DUPLICATE_SCORING_STRATEGY SUM_OF_BASE_QUALITIES --PROGRAM_RECORD_ID MarkDuplicates --PROGRAM_GROUP_NAME MarkDuplicates --READ_NAME_REGEX --OPTICAL_DUPLICATE_PIXEL_DISTANCE 100 --MAX_OPTICAL_DUPLICATE_SET_SIZE 300000 --VERBOSITY INFO --QUIET false --VALIDATION_STRINGENCY STRICT --COMPRESSION_LEVEL 5 --MAX_RECORDS_IN_RAM 500000 --CREATE_INDEX false --CREATE_MD5_FILE false --help false --version false --showHidden false --USE_JDK_DEFLATER false --USE_JDK_INFLATER false", + "## htsjdk.samtools.metrics.StringHeader" + ], + [ + "versions.yml:md5,b699af51b1956f3810f8a7c066e0ab17" + ] + ], + "timestamp": "2023-11-28T10:50:26.591387512" + } +} \ No newline at end of file diff --git a/modules/nf-core/picard/markduplicates/tests/nextflow.config b/modules/nf-core/picard/markduplicates/tests/nextflow.config new file mode 100644 index 00000000..02818dd6 --- /dev/null +++ b/modules/nf-core/picard/markduplicates/tests/nextflow.config @@ -0,0 +1,6 @@ +process { + withName: PICARD_MARKDUPLICATES { + ext.prefix = { "${meta.id}.marked" } + ext.args = '--ASSUME_SORT_ORDER queryname' + } +} diff --git a/modules/nf-core/picard/markduplicates/tests/tags.yml b/modules/nf-core/picard/markduplicates/tests/tags.yml new file mode 100644 index 00000000..4f213d62 --- /dev/null +++ b/modules/nf-core/picard/markduplicates/tests/tags.yml @@ -0,0 +1,2 @@ +picard/markduplicates: + - modules/nf-core/picard/markduplicates/** diff --git a/modules/nf-core/samtools/faidx/environment.yml b/modules/nf-core/samtools/faidx/environment.yml new file mode 100644 index 00000000..01ccbcc7 --- /dev/null +++ b/modules/nf-core/samtools/faidx/environment.yml @@ -0,0 +1,7 @@ +name: samtools_faidx +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.18 diff --git a/modules/nf-core/samtools/faidx/main.nf b/modules/nf-core/samtools/faidx/main.nf index c1e8ef3a..d3461627 100644 --- a/modules/nf-core/samtools/faidx/main.nf +++ b/modules/nf-core/samtools/faidx/main.nf @@ -2,10 +2,10 @@ process SAMTOOLS_FAIDX { tag "$fasta" label 'process_single' - conda "bioconda::samtools=1.17" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : - 'biocontainers/samtools:1.17--h00cdaf9_0' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.18--h50ea8bc_1' : + 'biocontainers/samtools:1.18--h50ea8bc_1' }" input: tuple val(meta), path(fasta) @@ -35,8 +35,12 @@ process SAMTOOLS_FAIDX { """ stub: + def match = (task.ext.args =~ /-o(?:utput)?\s(.*)\s?/).findAll() + def fastacmd = match[0] ? 
"touch ${match[0][1]}" : '' """ + ${fastacmd} touch ${fasta}.fai + cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/samtools/faidx/meta.yml b/modules/nf-core/samtools/faidx/meta.yml index 957b25e5..e189af28 100644 --- a/modules/nf-core/samtools/faidx/meta.yml +++ b/modules/nf-core/samtools/faidx/meta.yml @@ -55,3 +55,7 @@ authors: - "@drpatelh" - "@ewels" - "@phue" +maintainers: + - "@drpatelh" + - "@ewels" + - "@phue" diff --git a/modules/nf-core/samtools/flagstat/environment.yml b/modules/nf-core/samtools/flagstat/environment.yml new file mode 100644 index 00000000..5efae053 --- /dev/null +++ b/modules/nf-core/samtools/flagstat/environment.yml @@ -0,0 +1,7 @@ +name: samtools_flagstat +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.18 diff --git a/modules/nf-core/samtools/flagstat/main.nf b/modules/nf-core/samtools/flagstat/main.nf index eb7e72fc..f1893d7c 100644 --- a/modules/nf-core/samtools/flagstat/main.nf +++ b/modules/nf-core/samtools/flagstat/main.nf @@ -2,10 +2,10 @@ process SAMTOOLS_FLAGSTAT { tag "$meta.id" label 'process_single' - conda "bioconda::samtools=1.17" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : - 'biocontainers/samtools:1.17--h00cdaf9_0' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.18--h50ea8bc_1' : + 'biocontainers/samtools:1.18--h50ea8bc_1' }" input: tuple val(meta), path(bam), path(bai) @@ -32,4 +32,15 @@ process SAMTOOLS_FLAGSTAT { samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') END_VERSIONS """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.flagstat + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ } diff --git a/modules/nf-core/samtools/flagstat/meta.yml b/modules/nf-core/samtools/flagstat/meta.yml index 954225df..97991358 100644 --- a/modules/nf-core/samtools/flagstat/meta.yml +++ b/modules/nf-core/samtools/flagstat/meta.yml @@ -47,3 +47,5 @@ output: pattern: "versions.yml" authors: - "@drpatelh" +maintainers: + - "@drpatelh" diff --git a/modules/nf-core/samtools/flagstat/tests/main.nf.test b/modules/nf-core/samtools/flagstat/tests/main.nf.test new file mode 100644 index 00000000..c8dd8dc9 --- /dev/null +++ b/modules/nf-core/samtools/flagstat/tests/main.nf.test @@ -0,0 +1,36 @@ +nextflow_process { + + name "Test Process SAMTOOLS_FLAGSTAT" + script "../main.nf" + process "SAMTOOLS_FLAGSTAT" + tag "modules" + tag "modules_nfcore" + tag "samtools" + tag "samtools/flagstat" + + test("BAM") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam_bai'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.flagstat).match() }, + { assert path(process.out.versions.get(0)).getText().contains("samtools") } + ) + } + } +} diff --git a/modules/nf-core/samtools/flagstat/tests/main.nf.test.snap b/modules/nf-core/samtools/flagstat/tests/main.nf.test.snap new file mode 100644 index 00000000..880019f2 
--- /dev/null +++ b/modules/nf-core/samtools/flagstat/tests/main.nf.test.snap @@ -0,0 +1,16 @@ +{ + "BAM": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.flagstat:md5,4f7ffd1e6a5e85524d443209ac97d783" + ] + ] + ], + "timestamp": "2023-11-14T15:49:22.577133" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/flagstat/tests/tags.yml b/modules/nf-core/samtools/flagstat/tests/tags.yml new file mode 100644 index 00000000..2d2b7255 --- /dev/null +++ b/modules/nf-core/samtools/flagstat/tests/tags.yml @@ -0,0 +1,2 @@ +samtools/flagstat: + - modules/nf-core/samtools/flagstat/** diff --git a/modules/nf-core/samtools/idxstats/environment.yml b/modules/nf-core/samtools/idxstats/environment.yml new file mode 100644 index 00000000..2401db0f --- /dev/null +++ b/modules/nf-core/samtools/idxstats/environment.yml @@ -0,0 +1,7 @@ +name: samtools_idxstats +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.18 diff --git a/modules/nf-core/samtools/idxstats/main.nf b/modules/nf-core/samtools/idxstats/main.nf index a257d700..00d916bb 100644 --- a/modules/nf-core/samtools/idxstats/main.nf +++ b/modules/nf-core/samtools/idxstats/main.nf @@ -2,10 +2,10 @@ process SAMTOOLS_IDXSTATS { tag "$meta.id" label 'process_single' - conda "bioconda::samtools=1.17" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : - 'biocontainers/samtools:1.17--h00cdaf9_0' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.18--h50ea8bc_1' : + 'biocontainers/samtools:1.18--h50ea8bc_1' }" input: tuple val(meta), path(bam), path(bai) @@ -33,4 +33,16 @@ process SAMTOOLS_IDXSTATS { samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') END_VERSIONS """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + touch ${prefix}.idxstats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ } diff --git a/modules/nf-core/samtools/idxstats/meta.yml b/modules/nf-core/samtools/idxstats/meta.yml index dda87e1e..344e92a3 100644 --- a/modules/nf-core/samtools/idxstats/meta.yml +++ b/modules/nf-core/samtools/idxstats/meta.yml @@ -48,3 +48,5 @@ output: pattern: "versions.yml" authors: - "@drpatelh" +maintainers: + - "@drpatelh" diff --git a/modules/nf-core/samtools/idxstats/tests/main.nf.test b/modules/nf-core/samtools/idxstats/tests/main.nf.test new file mode 100644 index 00000000..f6c92150 --- /dev/null +++ b/modules/nf-core/samtools/idxstats/tests/main.nf.test @@ -0,0 +1,36 @@ +nextflow_process { + + name "Test Process SAMTOOLS_IDXSTATS" + script "../main.nf" + process "SAMTOOLS_IDXSTATS" + tag "modules" + tag "modules_nfcore" + tag "samtools" + tag "samtools/idxstats" + + test("BAM") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam_bai'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.idxstats).match() }, + { assert path(process.out.versions.get(0)).getText().contains("samtools") } + ) + } + } +} 
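The per-module nf-test suites added in this changeset can be exercised individually; a minimal sketch, assuming the nf-test CLI and Docker are available locally and the repository's standard test-data configuration is in place:

    # run one newly added module test suite (path and profile are illustrative)
    nf-test test modules/nf-core/samtools/idxstats/tests/main.nf.test --profile docker

Tests can also be selected via the tags declared inside each main.nf.test file (for example --tag samtools/idxstats) rather than by path.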
diff --git a/modules/nf-core/samtools/idxstats/tests/main.nf.test.snap b/modules/nf-core/samtools/idxstats/tests/main.nf.test.snap new file mode 100644 index 00000000..4c6c12bd --- /dev/null +++ b/modules/nf-core/samtools/idxstats/tests/main.nf.test.snap @@ -0,0 +1,16 @@ +{ + "BAM": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.idxstats:md5,df60a8c8d6621100d05178c93fb053a2" + ] + ] + ], + "timestamp": "2023-11-14T15:52:19.875194" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/idxstats/tests/tags.yml b/modules/nf-core/samtools/idxstats/tests/tags.yml new file mode 100644 index 00000000..d3057c61 --- /dev/null +++ b/modules/nf-core/samtools/idxstats/tests/tags.yml @@ -0,0 +1,2 @@ +samtools/idxstats: + - modules/nf-core/samtools/idxstats/** diff --git a/modules/nf-core/samtools/index/environment.yml b/modules/nf-core/samtools/index/environment.yml new file mode 100644 index 00000000..296ed99e --- /dev/null +++ b/modules/nf-core/samtools/index/environment.yml @@ -0,0 +1,7 @@ +name: samtools_index +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.18 diff --git a/modules/nf-core/samtools/index/main.nf b/modules/nf-core/samtools/index/main.nf index 0b20aa4b..8ad18fdc 100644 --- a/modules/nf-core/samtools/index/main.nf +++ b/modules/nf-core/samtools/index/main.nf @@ -2,10 +2,10 @@ process SAMTOOLS_INDEX { tag "$meta.id" label 'process_low' - conda "bioconda::samtools=1.17" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : - 'biocontainers/samtools:1.17--h00cdaf9_0' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.18--h50ea8bc_1' : + 'biocontainers/samtools:1.18--h50ea8bc_1' }" input: tuple val(meta), path(input) diff --git a/modules/nf-core/samtools/index/meta.yml b/modules/nf-core/samtools/index/meta.yml index 8bd2fa6f..01a4ee03 100644 --- a/modules/nf-core/samtools/index/meta.yml +++ b/modules/nf-core/samtools/index/meta.yml @@ -51,3 +51,7 @@ authors: - "@drpatelh" - "@ewels" - "@maxulysse" +maintainers: + - "@drpatelh" + - "@ewels" + - "@maxulysse" diff --git a/modules/nf-core/samtools/index/tests/csi.nextflow.config b/modules/nf-core/samtools/index/tests/csi.nextflow.config new file mode 100644 index 00000000..0ed260ef --- /dev/null +++ b/modules/nf-core/samtools/index/tests/csi.nextflow.config @@ -0,0 +1,7 @@ +process { + + withName: SAMTOOLS_INDEX { + ext.args = '-c' + } + +} diff --git a/modules/nf-core/samtools/index/tests/main.nf.test b/modules/nf-core/samtools/index/tests/main.nf.test new file mode 100644 index 00000000..c76a9169 --- /dev/null +++ b/modules/nf-core/samtools/index/tests/main.nf.test @@ -0,0 +1,87 @@ +nextflow_process { + + name "Test Process SAMTOOLS_INDEX" + script "../main.nf" + process "SAMTOOLS_INDEX" + tag "modules" + tag "modules_nfcore" + tag "samtools" + tag "samtools/index" + + test("sarscov2 [BAI]") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.bai).match("bai") }, + { assert path(process.out.versions.get(0)).getText().contains("samtools") } + ) + } + } + + test("homo_sapiens [CRAI]") { + + when { + params { + 
outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['homo_sapiens']['illumina']['test_paired_end_recalibrated_sorted_cram'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.crai).match("crai") }, + { assert path(process.out.versions.get(0)).getText().contains("samtools") } + ) + } + } + + test("homo_sapiens [CSI]") { + + config "./csi.nextflow.config" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert path(process.out.csi.get(0).get(1)).exists() }, + { assert path(process.out.versions.get(0)).getText().contains("samtools") } + ) + } + } +} diff --git a/modules/nf-core/samtools/index/tests/main.nf.test.snap b/modules/nf-core/samtools/index/tests/main.nf.test.snap new file mode 100644 index 00000000..b3baee7f --- /dev/null +++ b/modules/nf-core/samtools/index/tests/main.nf.test.snap @@ -0,0 +1,28 @@ +{ + "crai": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.paired_end.recalibrated.sorted.cram.crai:md5,14bc3bd5c89cacc8f4541f9062429029" + ] + ] + ], + "timestamp": "2023-11-15T15:17:37.30801" + }, + "bai": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.paired_end.sorted.bam.bai:md5,704c10dd1326482448ca3073fdebc2f4" + ] + ] + ], + "timestamp": "2023-11-15T15:17:30.869234" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/index/tests/tags.yml b/modules/nf-core/samtools/index/tests/tags.yml new file mode 100644 index 00000000..e0f58a7a --- /dev/null +++ b/modules/nf-core/samtools/index/tests/tags.yml @@ -0,0 +1,2 @@ +samtools/index: + - modules/nf-core/samtools/index/** diff --git a/modules/nf-core/samtools/sort/environment.yml b/modules/nf-core/samtools/sort/environment.yml new file mode 100644 index 00000000..cd50868c --- /dev/null +++ b/modules/nf-core/samtools/sort/environment.yml @@ -0,0 +1,7 @@ +name: samtools_sort +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.18 diff --git a/modules/nf-core/samtools/sort/main.nf b/modules/nf-core/samtools/sort/main.nf index 1e5181d4..4a666d42 100644 --- a/modules/nf-core/samtools/sort/main.nf +++ b/modules/nf-core/samtools/sort/main.nf @@ -2,10 +2,10 @@ process SAMTOOLS_SORT { tag "$meta.id" label 'process_medium' - conda "bioconda::samtools=1.17" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : - 'biocontainers/samtools:1.17--h00cdaf9_0' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.18--h50ea8bc_1' : + 'biocontainers/samtools:1.18--h50ea8bc_1' }" input: tuple val(meta), path(bam) @@ -21,13 +21,11 @@ process SAMTOOLS_SORT { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - def sort_memory = (task.memory.mega/task.cpus).intValue() if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
""" samtools sort \\ $args \\ -@ $task.cpus \\ - -m ${sort_memory}M \\ -o ${prefix}.bam \\ -T $prefix \\ $bam diff --git a/modules/nf-core/samtools/sort/meta.yml b/modules/nf-core/samtools/sort/meta.yml index 07328431..2200de72 100644 --- a/modules/nf-core/samtools/sort/meta.yml +++ b/modules/nf-core/samtools/sort/meta.yml @@ -46,3 +46,6 @@ output: authors: - "@drpatelh" - "@ewels" +maintainers: + - "@drpatelh" + - "@ewels" diff --git a/modules/nf-core/samtools/sort/tests/main.nf.test b/modules/nf-core/samtools/sort/tests/main.nf.test new file mode 100644 index 00000000..abb80978 --- /dev/null +++ b/modules/nf-core/samtools/sort/tests/main.nf.test @@ -0,0 +1,73 @@ +nextflow_process { + + name "Test Process SAMTOOLS_SORT" + script "../main.nf" + process "SAMTOOLS_SORT" + tag "modules" + tag "modules_nfcore" + tag "samtools" + tag "samtools/sort" + + test("test_samtools_sort") { + + config "./nextflow.config" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + [ + file(params.test_data['sarscov2']['illumina']['test_paired_end_bam'], checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("test_samtools_sort_stub") { + + config "./nextflow.config" + options "-stub-run" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + [ + file(params.test_data['sarscov2']['illumina']['test_paired_end_bam'], checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.versions + ).match() } + ) + } + + } + +} diff --git a/modules/nf-core/samtools/sort/tests/main.nf.test.snap b/modules/nf-core/samtools/sort/tests/main.nf.test.snap new file mode 100644 index 00000000..ff722259 --- /dev/null +++ b/modules/nf-core/samtools/sort/tests/main.nf.test.snap @@ -0,0 +1,48 @@ +{ + "test_samtools_sort": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.sorted.bam:md5,ea6a0fef94eb534e901f107a05a33a06" + ] + ], + "1": [ + + ], + "2": [ + "versions.yml:md5,33b6a403dc19a0d28e4219ccab0a1d80" + ], + "bam": [ + [ + { + "id": "test", + "single_end": false + }, + "test.sorted.bam:md5,ea6a0fef94eb534e901f107a05a33a06" + ] + ], + "csi": [ + + ], + "versions": [ + "versions.yml:md5,33b6a403dc19a0d28e4219ccab0a1d80" + ] + } + ], + "timestamp": "2023-12-04T11:11:22.005628301" + }, + "test_samtools_sort_stub": { + "content": [ + "test.sorted.bam", + [ + "versions.yml:md5,33b6a403dc19a0d28e4219ccab0a1d80" + ] + ], + "timestamp": "2023-12-04T17:47:22.314445935" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/sort/tests/nextflow.config b/modules/nf-core/samtools/sort/tests/nextflow.config new file mode 100644 index 00000000..d0f35086 --- /dev/null +++ b/modules/nf-core/samtools/sort/tests/nextflow.config @@ -0,0 +1,7 @@ +process { + + withName: SAMTOOLS_SORT { + ext.prefix = { "${meta.id}.sorted" } + } + +} diff --git a/modules/nf-core/samtools/sort/tests/tags.yml b/modules/nf-core/samtools/sort/tests/tags.yml new file mode 100644 index 00000000..cd63ea20 --- /dev/null +++ b/modules/nf-core/samtools/sort/tests/tags.yml @@ -0,0 +1,3 @@ +samtools/sort: + - modules/nf-core/samtools/sort/** + - tests/modules/nf-core/samtools/sort/** diff --git a/modules/nf-core/samtools/stats/environment.yml 
b/modules/nf-core/samtools/stats/environment.yml new file mode 100644 index 00000000..b89ce647 --- /dev/null +++ b/modules/nf-core/samtools/stats/environment.yml @@ -0,0 +1,7 @@ +name: samtools_stats +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.18 diff --git a/modules/nf-core/samtools/stats/main.nf b/modules/nf-core/samtools/stats/main.nf index 4a2607de..7539140a 100644 --- a/modules/nf-core/samtools/stats/main.nf +++ b/modules/nf-core/samtools/stats/main.nf @@ -2,10 +2,10 @@ process SAMTOOLS_STATS { tag "$meta.id" label 'process_single' - conda "bioconda::samtools=1.17" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : - 'biocontainers/samtools:1.17--h00cdaf9_0' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.18--h50ea8bc_1' : + 'biocontainers/samtools:1.18--h50ea8bc_1' }" input: tuple val(meta), path(input), path(input_index) diff --git a/modules/nf-core/samtools/stats/meta.yml b/modules/nf-core/samtools/stats/meta.yml index 90e6345f..735ff812 100644 --- a/modules/nf-core/samtools/stats/meta.yml +++ b/modules/nf-core/samtools/stats/meta.yml @@ -57,3 +57,7 @@ authors: - "@drpatelh" - "@FriederikeHanssen" - "@ramprasadn" +maintainers: + - "@drpatelh" + - "@FriederikeHanssen" + - "@ramprasadn" diff --git a/modules/nf-core/samtools/stats/tests/main.nf.test b/modules/nf-core/samtools/stats/tests/main.nf.test new file mode 100644 index 00000000..20c3efe1 --- /dev/null +++ b/modules/nf-core/samtools/stats/tests/main.nf.test @@ -0,0 +1,78 @@ +nextflow_process { + + name "Test Process SAMTOOLS_STATS" + script "../main.nf" + process "SAMTOOLS_STATS" + tag "modules" + tag "modules_nfcore" + tag "samtools" + tag "samtools/stats" + + test("SAMTOOLS STATS Should run without failures") { + + when { + params { + + outdir = "$outputDir" + } + process { + """ + // define inputs of the process here. 
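Note that the samtools/sort hunk above drops the hard-coded `-m ${sort_memory}M` flag along with the per-CPU memory calculation, so per-thread sort memory now falls back to the samtools default. Sites that still want to cap it can do so from a user config; a minimal sketch (the 1G value is purely illustrative):

process {
    withName: 'SAMTOOLS_SORT' {
        // cap per-thread sort memory; choose a value that fits within the task's memory allocation
        ext.args = '-m 1G'
    }
}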
+ input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam_bai'], checkIfExists: true) + + ] + input[1] = [[],[]] + """ + + } + } + + then { + assertAll( + {assert process.success}, + {assert snapshot(process.out).match()} + ) + } + + } + + test("SAMTOOLS CRAM Should run without failures") { + + when { + params { + + outdir = "$outputDir" + } + process { + """ + // define inputs of the process here + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['homo_sapiens']['illumina']['test_paired_end_recalibrated_sorted_cram'], checkIfExists: true), + file(params.test_data['homo_sapiens']['illumina']['test_paired_end_recalibrated_sorted_cram_crai'], checkIfExists: true) + + ] + input[1] = [ + [ id:'genome' ], + file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + + + } + + then { + assertAll( + {assert process.success}, + {assert snapshot(process.out).match()} + ) + } + + } + + +} diff --git a/modules/nf-core/samtools/stats/tests/main.nf.test.snap b/modules/nf-core/samtools/stats/tests/main.nf.test.snap new file mode 100644 index 00000000..025c83a5 --- /dev/null +++ b/modules/nf-core/samtools/stats/tests/main.nf.test.snap @@ -0,0 +1,64 @@ +{ + "SAMTOOLS STATS Should run without failures": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,045a48208b1c6f5b8af4347fe31f4def" + ] + ], + "1": [ + "versions.yml:md5,650a365c6635001436008350ae83337c" + ], + "stats": [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,045a48208b1c6f5b8af4347fe31f4def" + ] + ], + "versions": [ + "versions.yml:md5,650a365c6635001436008350ae83337c" + ] + } + ], + "timestamp": "2023-12-04T11:07:28.26821485" + }, + "SAMTOOLS CRAM Should run without failures": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,dfbfa130d4a6925ddd1931dcd8354a43" + ] + ], + "1": [ + "versions.yml:md5,650a365c6635001436008350ae83337c" + ], + "stats": [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,dfbfa130d4a6925ddd1931dcd8354a43" + ] + ], + "versions": [ + "versions.yml:md5,650a365c6635001436008350ae83337c" + ] + } + ], + "timestamp": "2023-12-04T11:07:50.356233402" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/stats/tests/tags.yml b/modules/nf-core/samtools/stats/tests/tags.yml new file mode 100644 index 00000000..7c28e30f --- /dev/null +++ b/modules/nf-core/samtools/stats/tests/tags.yml @@ -0,0 +1,2 @@ +samtools/stats: + - modules/nf-core/samtools/stats/** diff --git a/modules/nf-core/samtools/view/environment.yml b/modules/nf-core/samtools/view/environment.yml new file mode 100644 index 00000000..99aa69d0 --- /dev/null +++ b/modules/nf-core/samtools/view/environment.yml @@ -0,0 +1,7 @@ +name: samtools_view +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.18 diff --git a/modules/nf-core/samtools/view/main.nf b/modules/nf-core/samtools/view/main.nf index cb91facf..0b5a2912 100644 --- a/modules/nf-core/samtools/view/main.nf +++ b/modules/nf-core/samtools/view/main.nf @@ -2,10 +2,10 @@ process SAMTOOLS_VIEW { tag "$meta.id" label 'process_low' - conda "bioconda::samtools=1.17" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && 
!task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : - 'biocontainers/samtools:1.17--h00cdaf9_0' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.18--h50ea8bc_1' : + 'biocontainers/samtools:1.18--h50ea8bc_1' }" input: tuple val(meta), path(input), path(index) @@ -53,10 +53,19 @@ process SAMTOOLS_VIEW { """ stub: + def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" + def file_type = args.contains("--output-fmt sam") ? "sam" : + args.contains("--output-fmt bam") ? "bam" : + args.contains("--output-fmt cram") ? "cram" : + input.getExtension() + if ("$input" == "${prefix}.${file_type}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + + def index = args.contains("--write-index") ? "touch ${prefix}.csi" : "" + """ - touch ${prefix}.bam - touch ${prefix}.cram + touch ${prefix}.${file_type} + ${index} cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/samtools/view/meta.yml b/modules/nf-core/samtools/view/meta.yml index 3b05450b..3dadafae 100644 --- a/modules/nf-core/samtools/view/meta.yml +++ b/modules/nf-core/samtools/view/meta.yml @@ -82,3 +82,8 @@ authors: - "@joseespinosa" - "@FriederikeHanssen" - "@priyanka-surana" +maintainers: + - "@drpatelh" + - "@joseespinosa" + - "@FriederikeHanssen" + - "@priyanka-surana" diff --git a/modules/nf-core/samtools/view/tests/bam.config b/modules/nf-core/samtools/view/tests/bam.config new file mode 100644 index 00000000..c10d1081 --- /dev/null +++ b/modules/nf-core/samtools/view/tests/bam.config @@ -0,0 +1,3 @@ +process { + ext.args = "--output-fmt bam" +} \ No newline at end of file diff --git a/modules/nf-core/samtools/view/tests/bam_index.config b/modules/nf-core/samtools/view/tests/bam_index.config new file mode 100644 index 00000000..771ae033 --- /dev/null +++ b/modules/nf-core/samtools/view/tests/bam_index.config @@ -0,0 +1,3 @@ +process { + ext.args = "--output-fmt bam --write-index" +} \ No newline at end of file diff --git a/modules/nf-core/samtools/view/tests/main.nf.test b/modules/nf-core/samtools/view/tests/main.nf.test new file mode 100644 index 00000000..89ed3555 --- /dev/null +++ b/modules/nf-core/samtools/view/tests/main.nf.test @@ -0,0 +1,231 @@ +nextflow_process { + + name "Test Process SAMTOOLS_VIEW" + script "../main.nf" + process "SAMTOOLS_VIEW" + + tag "modules" + tag "modules_nfcore" + tag "samtools" + tag "samtools/view" + + test("sarscov2 - [bam, []], [], []") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['test_paired_end_bam'], checkIfExists: true), + [] + ] + input[1] = [[],[]] + input[2] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.cram, + process.out.sam, + process.out.bai, + process.out.crai, + process.out.csi, + process.out.versions + ).match() } + ) + } + + } + + test("homo_sapiens - [cram, crai], fasta, []") { + + when { + process { + """ + input[0] = [ + [ id: 'test' ], // meta map + file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_cram'], checkIfExists: true), + file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_cram_crai'], checkIfExists: true) + ] + input[1] = [ + [ id:'genome' ], + file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) + ] + input[2] = [] + """ + } + } + + then 
{ + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.cram[0][1]).name, + process.out.bam, + process.out.sam, + process.out.bai, + process.out.crai, + process.out.csi, + process.out.versions + ).match() } + ) + } + + } + + test("homo_sapiens - [cram, []], fasta, [] - bam output") { + + config "./bam.config" + + when { + process { + """ + input[0] = [ + [ id: 'test' ], // meta map + file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_cram'], checkIfExists: true), + [] + ] + input[1] = [ + [ id:'genome' ], + file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) + ] + input[2] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.cram, + process.out.sam, + process.out.bai, + process.out.crai, + process.out.csi, + process.out.versions + ).match() } + ) + } + + } + + test("homo_sapiens - [cram, []], fasta, [] - bam & index output") { + + config "./bam_index.config" + + when { + process { + """ + input[0] = [ + [ id: 'test' ], // meta map + file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_cram'], checkIfExists: true), + [] + ] + input[1] = [ + [ id:'genome' ], + file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) + ] + input[2] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.cram, + process.out.sam, + file(process.out.csi[0][1]).name, + process.out.crai, + process.out.bai, + process.out.versions + ).match() } + ) + } + + } + + test("homo_sapiens - [cram, []], fasta, qname - bam & index output") { + + config "./bam_index.config" + + when { + process { + """ + input[0] = [ + [ id: 'test' ], // meta map + file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_cram'], checkIfExists: true), + [] + ] + input[1] = [ + [ id:'genome' ], + file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) + ] + input[2] = Channel.of("testN:2817", "testN:2814").collectFile(name: "readnames.list", newLine: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.cram, + process.out.sam, + file(process.out.csi[0][1]).name, + process.out.crai, + process.out.bai, + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - [bam, []], [], [] - stub") { + + options "-stub" + config "./bam_index.config" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['test_paired_end_bam'], checkIfExists: true), + [] + ] + input[1] = [[],[]] + input[2] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.cram, + process.out.sam, + file(process.out.csi[0][1]).name, + process.out.crai, + process.out.bai, + process.out.versions + ).match() } + ) + } + + } + +} diff --git a/modules/nf-core/samtools/view/tests/main.nf.test.snap b/modules/nf-core/samtools/view/tests/main.nf.test.snap new file mode 100644 index 00000000..83427491 --- /dev/null +++ b/modules/nf-core/samtools/view/tests/main.nf.test.snap @@ -0,0 +1,140 @@ +{ + "homo_sapiens - [cram, []], fasta, [] - bam output": { + "content": [ + "test.bam", + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + 
"versions.yml:md5,06b9049228b111e7bed5c52fe8a98d9b" + ] + ], + "timestamp": "2023-12-04T17:41:17.563069206" + }, + "sarscov2 - [bam, []], [], []": { + "content": [ + "test.bam", + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + "versions.yml:md5,06b9049228b111e7bed5c52fe8a98d9b" + ] + ], + "timestamp": "2023-12-04T17:41:03.206994564" + }, + "homo_sapiens - [cram, []], fasta, qname - bam & index output": { + "content": [ + "test.bam", + [ + + ], + [ + + ], + "test.bam.csi", + [ + + ], + [ + + ], + [ + "versions.yml:md5,06b9049228b111e7bed5c52fe8a98d9b" + ] + ], + "timestamp": "2023-12-04T17:44:39.165289759" + }, + "homo_sapiens - [cram, []], fasta, [] - bam & index output": { + "content": [ + "test.bam", + [ + + ], + [ + + ], + "test.bam.csi", + [ + + ], + [ + + ], + [ + "versions.yml:md5,06b9049228b111e7bed5c52fe8a98d9b" + ] + ], + "timestamp": "2023-12-04T17:44:32.25731224" + }, + "sarscov2 - [bam, []], [], [] - stub": { + "content": [ + "test.bam", + [ + + ], + [ + + ], + "test.csi", + [ + + ], + [ + + ], + [ + "versions.yml:md5,06b9049228b111e7bed5c52fe8a98d9b" + ] + ], + "timestamp": "2023-12-04T17:44:45.81037195" + }, + "homo_sapiens - [cram, crai], fasta, []": { + "content": [ + "test.cram", + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + "versions.yml:md5,06b9049228b111e7bed5c52fe8a98d9b" + ] + ], + "timestamp": "2023-12-04T17:41:10.730011823" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/view/tests/tags.yml b/modules/nf-core/samtools/view/tests/tags.yml new file mode 100644 index 00000000..4fdf1dd1 --- /dev/null +++ b/modules/nf-core/samtools/view/tests/tags.yml @@ -0,0 +1,2 @@ +samtools/view: + - "modules/nf-core/samtools/view/**" diff --git a/modules/nf-core/trimgalore/environment.yml b/modules/nf-core/trimgalore/environment.yml new file mode 100644 index 00000000..6cd0f51b --- /dev/null +++ b/modules/nf-core/trimgalore/environment.yml @@ -0,0 +1,7 @@ +name: trimgalore +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::trim-galore=0.6.7 diff --git a/modules/nf-core/trimgalore/main.nf b/modules/nf-core/trimgalore/main.nf index dcb77ae7..24ead871 100644 --- a/modules/nf-core/trimgalore/main.nf +++ b/modules/nf-core/trimgalore/main.nf @@ -2,7 +2,7 @@ process TRIMGALORE { tag "$meta.id" label 'process_high' - conda "bioconda::trim-galore=0.6.7" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/trim-galore:0.6.7--hdfd78af_0' : 'biocontainers/trim-galore:0.6.7--hdfd78af_0' }" diff --git a/modules/nf-core/trimgalore/meta.yml b/modules/nf-core/trimgalore/meta.yml index f84c4d77..e649088c 100644 --- a/modules/nf-core/trimgalore/meta.yml +++ b/modules/nf-core/trimgalore/meta.yml @@ -62,3 +62,7 @@ authors: - "@drpatelh" - "@ewels" - "@FelixKrueger" +maintainers: + - "@drpatelh" + - "@ewels" + - "@FelixKrueger" diff --git a/modules/nf-core/trimgalore/tests/main.nf.test b/modules/nf-core/trimgalore/tests/main.nf.test new file mode 100644 index 00000000..bc6812cc --- /dev/null +++ b/modules/nf-core/trimgalore/tests/main.nf.test @@ -0,0 +1,105 @@ +nextflow_process { + + name "Test Process TRIMGALORE" + script "../main.nf" + process "TRIMGALORE" + tag "modules" + tag "modules_nfcore" + tag "trimgalore" + + test("test_trimgalore_single_end") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + def read_lines = ["@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1", + "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", + "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE + { assert path(process.out.reads.get(0).get(1)).linesGzip.contains(read_line) } + } + }, + { report1_lines.each { report1_line -> + { assert path(process.out.log.get(0).get(1)).getText().contains(report1_line) } + } + } + ) + } + } + + test("test_trimgalore_paired_end") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + def read1_lines = ["@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1", + "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", + "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE + { assert path(process.out.reads.get(0).get(1).get(0)).linesGzip.contains(read1_line) } + } + }, + { read2_lines.each { read2_line -> + { assert path(process.out.reads.get(0).get(1).get(1)).linesGzip.contains(read2_line) } + } + }, + { report1_lines.each { report1_line -> + { assert path(process.out.log.get(0).get(1).get(0)).getText().contains(report1_line) } + } + }, + { report2_lines.each { report2_line -> + { assert path(process.out.log.get(0).get(1).get(1)).getText().contains(report2_line) } + } + } + ) + } + } +} diff --git a/modules/nf-core/trimgalore/tests/main.nf.test.snap b/modules/nf-core/trimgalore/tests/main.nf.test.snap new file mode 100644 index 00000000..84feacca --- /dev/null +++ b/modules/nf-core/trimgalore/tests/main.nf.test.snap @@ -0,0 +1,148 @@ +{ + "test_trimgalore_single_end": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test_trimmed.fq.gz:md5,e0a7516b8ea8d6467d6306acb2cd13c4" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": true + }, + 
"test.fastq.gz_trimming_report.txt:md5,a1ab3958205f1ddf48af623242b5b429" + ] + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + "versions.yml:md5,47d966cbb31c80eb8f7fe860d55659b7" + ], + "html": [ + + ], + "log": [ + [ + { + "id": "test", + "single_end": true + }, + "test.fastq.gz_trimming_report.txt:md5,a1ab3958205f1ddf48af623242b5b429" + ] + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + "test_trimmed.fq.gz:md5,e0a7516b8ea8d6467d6306acb2cd13c4" + ] + ], + "unpaired": [ + + ], + "versions": [ + "versions.yml:md5,47d966cbb31c80eb8f7fe860d55659b7" + ], + "zip": [ + + ] + } + ], + "timestamp": "2023-10-17T15:24:57.782141441" + }, + "test_trimgalore_paired_end": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1_val_1.fq.gz:md5,e0a7516b8ea8d6467d6306acb2cd13c4", + "test_2_val_2.fq.gz:md5,f3d61189e6d10202da7b8686f1dbb71b" + ] + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.fastq.gz_trimming_report.txt:md5,315d40465412f9909bbaabf52269274d", + "test_2.fastq.gz_trimming_report.txt:md5,34436303da1c78811103427a2fb57f7b" + ] + ] + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + "versions.yml:md5,47d966cbb31c80eb8f7fe860d55659b7" + ], + "html": [ + + ], + "log": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.fastq.gz_trimming_report.txt:md5,315d40465412f9909bbaabf52269274d", + "test_2.fastq.gz_trimming_report.txt:md5,34436303da1c78811103427a2fb57f7b" + ] + ] + ], + "reads": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1_val_1.fq.gz:md5,e0a7516b8ea8d6467d6306acb2cd13c4", + "test_2_val_2.fq.gz:md5,f3d61189e6d10202da7b8686f1dbb71b" + ] + ] + ], + "unpaired": [ + + ], + "versions": [ + "versions.yml:md5,47d966cbb31c80eb8f7fe860d55659b7" + ], + "zip": [ + + ] + } + ], + "timestamp": "2023-10-17T15:25:08.513589909" + } +} \ No newline at end of file diff --git a/modules/nf-core/trimgalore/tests/tags.yml b/modules/nf-core/trimgalore/tests/tags.yml new file mode 100644 index 00000000..e9937691 --- /dev/null +++ b/modules/nf-core/trimgalore/tests/tags.yml @@ -0,0 +1,2 @@ +trimgalore: + - modules/nf-core/trimgalore/** diff --git a/nextflow.config b/nextflow.config index 7eed736f..1d5f672a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -10,12 +10,12 @@ params { // Input options input = null - input_format = "FASTQ" + input_format = null // References genome = null - igenomes_base = 's3://ngi-igenomes/igenomes' + igenomes_base = 's3://ngi-igenomes/igenomes/' igenomes_ignore = false // BWA Reference @@ -27,7 +27,7 @@ params { save_sorted_bam = false // Circular DNA identification options - circle_identifier = 'circexplorer2' + circle_identifier = null // FASTQC options skip_qc = false @@ -69,7 +69,6 @@ params { // Boilerplate options outdir = null - tracedir = "${params.outdir}/pipeline_info" publish_dir_mode = 'copy' email = null email_on_fail = null @@ -78,24 +77,28 @@ params { hook_url = null help = false version = false - validate_params = true - show_hidden_params = false - schema_ignore_params = 'genomes' // Config options + config_profile_name = null + config_profile_description = null custom_config_version = 'master' custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" - config_profile_description = null config_profile_contact = null config_profile_url = null - config_profile_name = null - // Max resource options // Defaults only, expecting to be overwritten max_memory = '128.GB' 
max_cpus = 16 max_time = '240.h' + + // Schema validation default options + validationFailUnrecognisedParams = false + validationLenientMode = false + validationSchemaIgnoreParams = 'genomes,igenomes_base' + validationShowHiddenParams = false + validate_params = true + } // Load base.config by default for all pipelines @@ -109,19 +112,18 @@ try { } // Load nf-core/circdna custom profiles from different institutions. -// Warning: Uncomment only if a pipeline-specific instititutional config already exists on nf-core/configs! +// Warning: Uncomment only if a pipeline-specific institutional config already exists on nf-core/configs! // try { // includeConfig "${params.custom_config_base}/pipeline/circdna.config" // } catch (Exception e) { // System.err.println("WARNING: Could not load nf-core/config/circdna profiles: ${params.custom_config_base}/pipeline/circdna.config") // } - - profiles { debug { dumpHashes = true process.beforeScript = 'echo $HOSTNAME' - cleanup = false + cleanup = false + nextflow.enable.configProcessNamesValidation = true } conda { conda.enabled = true @@ -130,6 +132,7 @@ profiles { podman.enabled = false shifter.enabled = false charliecloud.enabled = false + channels = ['conda-forge', 'bioconda', 'defaults'] apptainer.enabled = false } mamba { @@ -144,17 +147,16 @@ profiles { } docker { docker.enabled = true - docker.registry = 'quay.io' - docker.userEmulation = true conda.enabled = false singularity.enabled = false podman.enabled = false shifter.enabled = false charliecloud.enabled = false apptainer.enabled = false + docker.runOptions = '-u $(id -u):$(id -g)' } arm { - docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' + docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' } singularity { singularity.enabled = true @@ -168,7 +170,6 @@ profiles { } podman { podman.enabled = true - podman.registry = 'quay.io' conda.enabled = false docker.enabled = false singularity.enabled = false @@ -196,6 +197,7 @@ profiles { } apptainer { apptainer.enabled = true + apptainer.autoMounts = true conda.enabled = false docker.enabled = false singularity.enabled = false @@ -205,14 +207,27 @@ profiles { } gitpod { executor.name = 'local' - executor.cpus = 16 - executor.memory = 60.GB + executor.cpus = 4 + executor.memory = 8.GB } test { includeConfig 'conf/test.config' } test_AA { includeConfig 'conf/test_AA.config' } + test_AA_local { includeConfig 'conf/test_AA_local.config' } test_full { includeConfig 'conf/test_full.config' } } +// Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile +// Will not be used unless Apptainer / Docker / Podman / Singularity are enabled +// Set to your registry if you have a mirror of containers +apptainer.registry = 'quay.io' +docker.registry = 'quay.io' +podman.registry = 'quay.io' +singularity.registry = 'quay.io' + +// Nextflow plugins +plugins { + id 'nf-validation@1.1.3' // Validation of pipeline parameters and creation of an input channel from a sample sheet +} // Load igenomes.config if required if (!params.igenomes_ignore) { @@ -220,8 +235,6 @@ if (!params.igenomes_ignore) { } else { params.genomes = [:] } - - // Export these variables to prevent local Python/R libraries from conflicting with those in the container // The JULIA depot path has been adjusted to a fixed path `/usr/local/share/julia` that needs to be used for packages in the container. // See https://apeltzer.github.io/post/03-julia-lang-nextflow/ for details on that. 
Once we have a common agreement on where to keep Julia packages, this is adjustable. @@ -236,23 +249,30 @@ env { // Capture exit codes from upstream processes when piping process.shell = ['/bin/bash', '-euo', 'pipefail'] +// Disable process selector warnings by default. Use debug profile to enable warnings. +nextflow.enable.configProcessNamesValidation = false + def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') timeline { enabled = true - file = "${params.tracedir}/execution_timeline_${trace_timestamp}.html" + file = "${params.outdir}/pipeline_info/execution_timeline_${trace_timestamp}.html" } report { enabled = true - file = "${params.tracedir}/execution_report_${trace_timestamp}.html" + file = "${params.outdir}/pipeline_info/execution_report_${trace_timestamp}.html" } trace { enabled = true - file = "${params.tracedir}/execution_trace_${trace_timestamp}.txt" + file = "${params.outdir}/pipeline_info/execution_trace_${trace_timestamp}.txt" } dag { enabled = true - file = "${params.tracedir}/pipeline_dag_${trace_timestamp}.html" + file = "${params.outdir}/pipeline_info/pipeline_dag_${trace_timestamp}.html" } +// wave { +// enabled = true +// strategy = ['container', 'container'] +// } manifest { name = 'nf-core/circdna' @@ -260,8 +280,8 @@ manifest { homePage = 'https://github.com/nf-core/circdna' description = """Pipeline for the identification of circular DNAs""" mainScript = 'main.nf' - nextflowVersion = '!>=22.10.1' - version = '1.0.4' + nextflowVersion = '!>=23.04.0' + version = '1.1' doi = '10.5281/zenodo.7712010' } diff --git a/nextflow_schema.json b/nextflow_schema.json index 83f2899a..08b8d4dd 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -15,9 +15,9 @@ "input": { "type": "string", "format": "file-path", + "exists": true, "mimetype": "text/csv", "pattern": "^\\S+\\.csv$", - "schema": "assets/schema_input.json", "description": "Path to comma-separated file containing information about the samples in the experiment.", "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with either 2 [BAM] or 3 [FASTQ] columns, and a header row. See [usage docs](https://nf-co.re/circdna/usage#samplesheet-input).", "fa_icon": "fas fa-file-csv" @@ -25,8 +25,7 @@ "input_format": { "type": "string", "description": "Specify input format. Default *FASTQ*. Options 'FASTQ' or 'BAM'.", - "default": "FASTQ", - "help_text": "Define which input file formats are used in the pipeline run. Use either `--input_format FASTQ` or `--input_format BAM`.", + "help_text": "Define which input file formats are used in the pipeline run. Use either `--input_format 'FASTQ'` or `--input_format 'BAM'`.", "fa_icon": "fas fa-pen" }, "bam_sorted": { @@ -61,6 +60,21 @@ } } }, + "circdna_identifier_options": { + "title": "Circular DNA identifier options", + "type": "object", + "fa_icon": "fas fa-circle-notch", + "description": "Options to adjust inital circular DNA identifier", + "required": ["circle_identifier"], + "properties": { + "circle_identifier": { + "type": "string", + "description": "Specifies the circular DNA identification algorithm to use - available 'circle_map_realign', 'circle_map_repeats', 'circle_finder', 'circexplorer2', and 'ampliconarchitect'. Multiple circle_identifier's can be specified with a comma-separated string. E.g. 
`--circle_identifier 'circle_map_realign,unicycler'`.", + "help_text": "Specify the circle_identifier branch used. Multiple circle_identifier's can be specified with a comma-separated string. E.g. `--circle_identifier 'circle_map_realign,unicycler'`.", + "fa_icon": "fas fa-circle-notch" + } + } + }, "reference_genome_options": { "title": "Reference genome options", "type": "object", @@ -76,20 +90,13 @@ "fasta": { "type": "string", "format": "file-path", + "exists": true, "mimetype": "text/plain", "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$", "description": "Path to FASTA genome file.", "help_text": "This parameter is *mandatory* if `--genome` is not specified. If you don't have a BWA index available this will be generated for you automatically. Combine with `--save_reference` to save BWA index for future runs.", "fa_icon": "far fa-file-code" }, - "igenomes_base": { - "type": "string", - "format": "directory-path", - "description": "Directory / URL base for iGenomes references.", - "default": "s3://ngi-igenomes/igenomes", - "fa_icon": "fas fa-cloud-download-alt", - "hidden": true - }, "igenomes_ignore": { "type": "boolean", "description": "Do not load the iGenomes reference config.", @@ -106,10 +113,9 @@ }, "bwa_index": { "type": "string", - "format": "file-path", + "format": "directory-path", "mimetype": "text/plain", - "pattern": "^\\S+\\.\\{amb,ann,bwt,pac,sa\\}$", - "description": "Path to BWA Index genome file.", + "description": "Path to the directory containg the BWA index files.", "help_text": "This parameter is *optional*. If you don't have a BWA index available this will be generated for you automatically.", "fa_icon": "far fa-file-code" } @@ -213,21 +219,6 @@ } } }, - "circdna_identifier_options": { - "title": "Circular DNA identifier options", - "type": "object", - "fa_icon": "fas fa-circle-notch", - "description": "Options to adjust inital circular DNA identifier", - "properties": { - "circle_identifier": { - "type": "string", - "description": "Specifies the circular DNA identification algorithm to use - available 'circle_map_realign', 'circle_map_repeats', 'circle_finder', 'circexplorer2', and 'ampliconarchitect'.", - "help_text": "Specify the circle_identifier branch used. Multiple circle_identifier's can be specified with a comma-separated string. E.g. `--circle_identifier 'circle_map_realign,unicycler'`.", - "fa_icon": "fas fa-circle-notch", - "default": "circle_map_realign" - } - } - }, "circle_map_options": { "title": "circle-map options", "type": "object", @@ -309,8 +300,7 @@ "mimetype": "text/plain", "fa_icon": "fas fa-book", "description": "When running AmpliconArchitect, specify reference build ['GRCh37', 'GRCh38', 'mm10']. This is *mandatory* to match fasta and AA reference build!", - "help_text": "Specify the reference genome build used for alignment of the WGS reads.", - "default": "GRCh38" + "help_text": "Specify the reference genome build used for alignment of the WGS reads." }, "cnvkit_cnn": { "type": "string", @@ -398,7 +388,7 @@ "description": "Maximum amount of time that can be requested for any single job.", "default": "240.h", "fa_icon": "far fa-clock", - "pattern": "^(\\d+\\.?\\s*(s|m|h|day)\\s*)+$", + "pattern": "^(\\d+\\.?\\s*(s|m|h|d|day)\\s*)+$", "hidden": true, "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. 
`--max_time '2.h'`" } @@ -469,6 +459,7 @@ }, "multiqc_config": { "type": "string", + "format": "file-path", "description": "Custom config file to supply to MultiQC.", "fa_icon": "fas fa-cog", "hidden": true @@ -484,13 +475,6 @@ "description": "Custom MultiQC yaml file containing HTML including a methods description.", "fa_icon": "fas fa-cog" }, - "tracedir": { - "type": "string", - "description": "Directory to keep pipeline Nextflow logs and reports.", - "default": "${params.outdir}/pipeline_info", - "fa_icon": "fas fa-cogs", - "hidden": true - }, "validate_params": { "type": "boolean", "description": "Boolean whether to validate parameters against the schema at runtime", @@ -498,12 +482,26 @@ "fa_icon": "fas fa-check-square", "hidden": true }, - "show_hidden_params": { + "validationShowHiddenParams": { "type": "boolean", "fa_icon": "far fa-eye-slash", "description": "Show all params when using `--help`", "hidden": true, "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." + }, + "validationFailUnrecognisedParams": { + "type": "boolean", + "fa_icon": "far fa-check-circle", + "description": "Validation of parameters fails when an unrecognised parameter is found.", + "hidden": true, + "help_text": "By default, when an unrecognised parameter is found, it returns a warinig." + }, + "validationLenientMode": { + "type": "boolean", + "fa_icon": "far fa-check-circle", + "description": "Validation of parameters in lenient more.", + "hidden": true, + "help_text": "Allows string values that are parseable as numbers or booleans. For further information see [JSONSchema docs](https://github.com/everit-org/json-schema#lenient-mode)." } } } diff --git a/pyproject.toml b/pyproject.toml index 0d62beb6..7d08e1c8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,10 +1,13 @@ -# Config file for Python. Mostly used to configure linting of bin/check_samplesheet.py with Black. +# Config file for Python. Mostly used to configure linting of bin/*.py with Ruff. # Should be kept the same as nf-core/tools to avoid fighting with template synchronisation. 
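The nextflow.config hunk above pins quay.io as the registry for Apptainer, Docker, Podman and Singularity and invites sites with a container mirror to override it. A minimal sketch of such an override, loaded with `-c site.config` (the registry host is made up):

docker.registry      = 'containers.example.org'
podman.registry      = 'containers.example.org'
singularity.registry = 'containers.example.org'
apptainer.registry   = 'containers.example.org'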
-[tool.black] +[tool.ruff] line-length = 120 -target_version = ["py37", "py38", "py39", "py310"] +target-version = "py38" +select = ["I", "E1", "E4", "E7", "E9", "F", "UP", "N"] +cache-dir = "~/.cache/ruff" -[tool.isort] -profile = "black" -known_first_party = ["nf_core"] -multi_line_output = 3 +[tool.ruff.isort] +known-first-party = ["nf_core"] + +[tool.ruff.per-file-ignores] +"__init__.py" = ["E402", "F401"] diff --git a/subworkflows/nf-core/bam_markduplicates_picard/main.nf b/subworkflows/nf-core/bam_markduplicates_picard/main.nf index 6e3df332..de8130fb 100644 --- a/subworkflows/nf-core/bam_markduplicates_picard/main.nf +++ b/subworkflows/nf-core/bam_markduplicates_picard/main.nf @@ -26,13 +26,9 @@ workflow BAM_MARKDUPLICATES_PICARD { ch_bam_bai = PICARD_MARKDUPLICATES.out.bam .join(SAMTOOLS_INDEX.out.bai, by: [0], remainder: true) .join(SAMTOOLS_INDEX.out.csi, by: [0], remainder: true) - .map { - meta, bam, bai, csi -> - if (bai) { - [ meta, bam, bai ] - } else { - [ meta, bam, csi ] - } + .map{meta, bam, bai, csi -> + if (bai) [ meta, bam, bai ] + else [ meta, bam, csi ] } BAM_STATS_SAMTOOLS ( ch_bam_bai, ch_fasta ) diff --git a/subworkflows/nf-core/bam_markduplicates_picard/meta.yml b/subworkflows/nf-core/bam_markduplicates_picard/meta.yml index d5e71609..fe63068e 100644 --- a/subworkflows/nf-core/bam_markduplicates_picard/meta.yml +++ b/subworkflows/nf-core/bam_markduplicates_picard/meta.yml @@ -6,14 +6,13 @@ keywords: - bam - sam - cram - -modules: +components: - picard/markduplicates - samtools/index - samtools/stats - samtools/idxstats - samtools/flagstat - + - bam_stats_samtools input: - ch_bam: description: | @@ -59,3 +58,6 @@ output: authors: - "@dmarron" - "@drpatelh" +maintainers: + - "@dmarron" + - "@drpatelh" diff --git a/subworkflows/nf-core/bam_markduplicates_picard/tests/main.nf.test b/subworkflows/nf-core/bam_markduplicates_picard/tests/main.nf.test new file mode 100644 index 00000000..d8d24290 --- /dev/null +++ b/subworkflows/nf-core/bam_markduplicates_picard/tests/main.nf.test @@ -0,0 +1,93 @@ +nextflow_workflow { + + name "Test Workflow BAM_MARKDUPLICATES_PICARD" + script "../main.nf" + workflow "BAM_MARKDUPLICATES_PICARD" + + tag "picard" + tag "picard/markduplicates" + tag "subworkflows" + tag "subworkflows_nfcore" + tag "bam_markduplicates_picard" + tag "subworkflows/bam_markduplicates_picard" + tag "subworkflows/bam_stats_samtools" + tag "bam_stats_samtools" + tag "samtools" + tag "samtools/flagstat" + tag "samtools/idxstats" + tag "samtools/index" + tag "samtools/stats" + + test("sarscov2 - bam") { + + when { + workflow { + """ + input[0] = Channel.of([ + [ id:'test', single_end: false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) + ]) + input[1] = Channel.of([ + [ id:'genome' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ]) + input[2] = Channel.of([ + [ id:'genome' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll( + { assert workflow.success}, + { assert snapshot( + path(workflow.out.bam[0][1]), + path(workflow.out.bai[0][1]), + path(workflow.out.flagstat[0][1]), + path(workflow.out.idxstats[0][1]), + path(workflow.out.stats[0][1]), + ).match("sarscov2 - bam") }, + { assert path(workflow.out.metrics.get(0).get(1)).getText().contains("97") } + ) + } + } + + test("homo_sapiens - cram") { + + when { + 
workflow { + """ + input[0] = Channel.of([ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test.paired_end.sorted.cram', checkIfExists: true) + ]) + input[1] = Channel.of([ + [ id:'genome' ], + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) + ]) + input[2] = Channel.of([ + [ id:'genome' ], + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta.fai', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll( + { assert workflow.success}, + { assert snapshot( + path(workflow.out.bam[0][1]), + path(workflow.out.bai[0][1]), + path(workflow.out.flagstat[0][1]), + path(workflow.out.idxstats[0][1]), + path(workflow.out.stats[0][1]), + ).match("homo_sapiens - cram") }, + { assert path(workflow.out.metrics.get(0).get(1)).getText().contains("0.999986") } + ) + } + } + +} diff --git a/subworkflows/nf-core/bam_markduplicates_picard/tests/main.nf.test.snap b/subworkflows/nf-core/bam_markduplicates_picard/tests/main.nf.test.snap new file mode 100644 index 00000000..a208d101 --- /dev/null +++ b/subworkflows/nf-core/bam_markduplicates_picard/tests/main.nf.test.snap @@ -0,0 +1,22 @@ +{ + "homo_sapiens - cram": { + "content": [ + "test.bam:md5,6641dc05efa8384a061f378d86d922cd", + "test.bam.bai:md5,c41c60d8a94adebe53b6df80b6e90d38", + "test.flagstat:md5,93b0ef463df947ede1f42ff60396c34d", + "test.idxstats:md5,e179601fa7b8ebce81ac3765206f6c15", + "test.stats:md5,0035ac8900d85e9a790f4c1f48b76947" + ], + "timestamp": "2023-12-05T17:45:12.484869" + }, + "sarscov2 - bam": { + "content": [ + "test.bam:md5,3091fe6ba1b7530f382fe40b9fd8f45b", + "test.bam.bai:md5,4d3ae8d013444b55e17aa0149a2ab404", + "test.flagstat:md5,4f7ffd1e6a5e85524d443209ac97d783", + "test.idxstats:md5,df60a8c8d6621100d05178c93fb053a2", + "test.stats:md5,e32e7e49dce1fbe327a89e0fb7bc01b1" + ], + "timestamp": "2023-12-05T17:43:58.582652" + } +} diff --git a/subworkflows/nf-core/bam_markduplicates_picard/tests/tags.yml b/subworkflows/nf-core/bam_markduplicates_picard/tests/tags.yml new file mode 100644 index 00000000..10b85270 --- /dev/null +++ b/subworkflows/nf-core/bam_markduplicates_picard/tests/tags.yml @@ -0,0 +1,2 @@ +subworkflows/bam_markduplicates_picard: + - subworkflows/nf-core/bam_markduplicates_picard/** diff --git a/subworkflows/nf-core/bam_stats_samtools/meta.yml b/subworkflows/nf-core/bam_stats_samtools/meta.yml index b05086bc..809bf736 100644 --- a/subworkflows/nf-core/bam_stats_samtools/meta.yml +++ b/subworkflows/nf-core/bam_stats_samtools/meta.yml @@ -7,7 +7,7 @@ keywords: - bam - sam - cram -modules: +components: - samtools/stats - samtools/idxstats - samtools/flagstat @@ -39,3 +39,5 @@ output: Structure: [ path(versions.yml) ] authors: - "@drpatelh" +maintainers: + - "@drpatelh" diff --git a/subworkflows/nf-core/bam_stats_samtools/tests/main.nf.test b/subworkflows/nf-core/bam_stats_samtools/tests/main.nf.test new file mode 100644 index 00000000..c8b21f28 --- /dev/null +++ b/subworkflows/nf-core/bam_stats_samtools/tests/main.nf.test @@ -0,0 +1,108 @@ +nextflow_workflow { + + name "Test Workflow BAM_STATS_SAMTOOLS" + script "../main.nf" + workflow "BAM_STATS_SAMTOOLS" + tag "subworkflows" + tag "subworkflows_nfcore" + tag "bam_stats_samtools" + tag "subworkflows/bam_stats_samtools" + tag "samtools" + tag "samtools/flagstat" + tag "samtools/idxstats" + tag "samtools/stats" + + test("test_bam_stats_samtools_single_end") { + + when { + params { + outdir = "$outputDir" + } + 
workflow { + """ + input[0] = Channel.of([ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.single_end.sorted.bam', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.single_end.sorted.bam.bai', checkIfExists: true) + ]) + input[1] = Channel.of([ + [ id:'genome' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll( + { assert workflow.success}, + { assert snapshot(workflow.out.stats).match("test_bam_stats_samtools_single_end_stats") }, + { assert snapshot(workflow.out.flagstat).match("test_bam_stats_samtools_single_end_flagstats") }, + { assert snapshot(workflow.out.idxstats).match("test_bam_stats_samtools_single_end_idxstats") } + ) + } + } + + test("test_bam_stats_samtools_paired_end") { + + when { + params { + outdir = "$outputDir" + } + workflow { + """ + input[0] = Channel.of([ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam.bai', checkIfExists: true) + ]) + input[1] = Channel.of([ + [ id:'genome' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(workflow.out.stats).match("test_bam_stats_samtools_paired_end_stats") }, + { assert snapshot(workflow.out.flagstat).match("test_bam_stats_samtools_paired_end_flagstats") }, + { assert snapshot(workflow.out.idxstats).match("test_bam_stats_samtools_paired_end_idxstats") } + ) + } + } + + test("test_bam_stats_samtools_paired_end_cram") { + + when { + params { + outdir = "$outputDir" + } + workflow { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test.paired_end.sorted.cram', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test.paired_end.sorted.cram.crai', checkIfExists: true) + ]) + input[1] = Channel.of([ + [ id:'genome' ], + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll( + { assert workflow.success}, + { assert snapshot(workflow.out.stats).match("test_bam_stats_samtools_paired_end_cram_stats") }, + { assert snapshot(workflow.out.flagstat).match("test_bam_stats_samtools_paired_end_cram_flagstats") }, + { assert snapshot(workflow.out.idxstats).match("test_bam_stats_samtools_paired_end_cram_idxstats") } + ) + } + } + +} diff --git a/subworkflows/nf-core/bam_stats_samtools/tests/main.nf.test.snap b/subworkflows/nf-core/bam_stats_samtools/tests/main.nf.test.snap new file mode 100644 index 00000000..8bf0d379 --- /dev/null +++ b/subworkflows/nf-core/bam_stats_samtools/tests/main.nf.test.snap @@ -0,0 +1,128 @@ +{ + "test_bam_stats_samtools_paired_end_cram_flagstats": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.flagstat:md5,a53f3d26e2e9851f7d528442bbfe9781" + ] + ] + ], + "timestamp": "2023-11-06T09:31:26.194017574" + }, + "test_bam_stats_samtools_paired_end_stats": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + 
"test.stats:md5,49e2b43344ff92bc4c02463a58f7ba4a" + ] + ] + ], + "timestamp": "2024-01-18T17:17:27.704335" + }, + "test_bam_stats_samtools_paired_end_flagstats": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.flagstat:md5,4f7ffd1e6a5e85524d443209ac97d783" + ] + ] + ], + "timestamp": "2024-01-18T17:17:27.717482" + }, + "test_bam_stats_samtools_single_end_flagstats": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.flagstat:md5,2191911d72575a2358b08b1df64ccb53" + ] + ] + ], + "timestamp": "2023-11-06T09:26:10.340046381" + }, + "test_bam_stats_samtools_paired_end_cram_idxstats": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.idxstats:md5,e179601fa7b8ebce81ac3765206f6c15" + ] + ] + ], + "timestamp": "2023-11-06T09:31:26.207052003" + }, + "test_bam_stats_samtools_single_end_stats": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.stats:md5,5a6667d97806e5002731e9cf23674fad" + ] + ] + ], + "timestamp": "2023-12-04T11:07:06.676820877" + }, + "test_bam_stats_samtools_paired_end_idxstats": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.idxstats:md5,df60a8c8d6621100d05178c93fb053a2" + ] + ] + ], + "timestamp": "2024-01-18T17:17:27.726719" + }, + "test_bam_stats_samtools_single_end_idxstats": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.idxstats:md5,613e048487662c694aa4a2f73ca96a20" + ] + ] + ], + "timestamp": "2023-11-06T09:26:10.349439801" + }, + "test_bam_stats_samtools_paired_end_cram_stats": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,2cf2fe93596ee3d74f946097b204a629" + ] + ] + ], + "timestamp": "2023-12-04T11:07:22.30295557" + } +} \ No newline at end of file diff --git a/subworkflows/nf-core/bam_stats_samtools/tests/tags.yml b/subworkflows/nf-core/bam_stats_samtools/tests/tags.yml new file mode 100644 index 00000000..ec2f2d68 --- /dev/null +++ b/subworkflows/nf-core/bam_stats_samtools/tests/tags.yml @@ -0,0 +1,2 @@ +subworkflows/bam_stats_samtools: + - subworkflows/nf-core/bam_stats_samtools/** diff --git a/workflows/circdna.nf b/workflows/circdna.nf index 6dd647e6..d4da910a 100644 --- a/workflows/circdna.nf +++ b/workflows/circdna.nf @@ -1,21 +1,19 @@ /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - VALIDATE INPUTS + PRINT PARAMS SUMMARY ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ +include { paramsSummaryLog; paramsSummaryMap } from 'plugin/nf-validation' -def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params) +def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) +def citation = '\n' + WorkflowMain.citation(workflow) + '\n' +def summary_params = paramsSummaryMap(workflow) -// Validate input parameters +// Print parameter summary log to screen +log.info logo + paramsSummaryLog(workflow) + citation WorkflowCircdna.initialise(params, log) -// Check input path parameters to see if they exist -def checkPathParamList = [ params.input, params.multiqc_config, params.fasta ] -for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } - -// Check mandatory parameters -if (params.input) { ch_input = Channel.fromPath(params.input) } else { exit 1, 'Input samplesheet not specified!' } if (params.fasta) { ch_fasta = Channel.fromPath(params.fasta) } else { exit 1, 'Fasta reference genome not specified!' 
} if (!(params.input_format == "FASTQ" | params.input_format == "BAM")) { @@ -41,22 +39,25 @@ if (run_unicycler && !params.input_format == "FASTQ") { exit 1, 'Unicycler needs FastQ input. Please specify input_format == "FASTQ", if possible, or don`t run unicycler.' } -if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } +if (!params.input) { exit 1, 'Input samplesheet not specified!' } // Check if BWA Index is given if (params.bwa_index) { - ch_bwa_index = Channel.fromPath(params.bwa_index).collect() + ch_bwa_index = Channel.fromPath(params.bwa_index, type: 'dir').collect() + ch_bwa_index = ch_bwa_index.map{ index -> ["bwa_index", index] }.collect() bwa_index_exists = true - } else { - ch_bwa_index = Channel.empty() - bwa_index_exists = false - } +} else { + ch_bwa_index = Channel.empty() + bwa_index_exists = false +} // AMPLICON ARCHITECT INPUT if (run_ampliconarchitect) { - mosek_license_dir = file(params.mosek_license_dir) - if (!mosek_license_dir.exists()) { + mosek_license_dir = params.mosek_license_dir + if (!params.mosek_license_dir) { exit 1, "Mosek License Directory is missing! Please specifiy directory containing mosek license using --mosek_license_dir and rename license to 'mosek.lic'." + } else { + mosek_license_dir = file(params.mosek_license_dir) } if (!params.aa_data_repo) { exit 1, "AmpliconArchitect Data Repository Missing! Please see https://github.com/jluebeck/AmpliconArchitect for more information and specify its absolute path using --aa_data_repo." } if (params.reference_build != "hg19" & params.reference_build != "GRCh38" & params.reference_build != "GRCh37" & params.reference_build != "mm10"){ @@ -148,16 +149,7 @@ include { CIRCLEFINDER } from '../modules/local include { CIRCEXPLORER2_PARSE } from '../modules/local/circexplorer2/parse.nf' // AmpliconArchitect -include { CNVKIT_BATCH } from '../modules/local/cnvkit/batch/main.nf' -include { CNVKIT_SEGMENT } from '../modules/local/cnvkit/segment.nf' -include { PREPAREAA } from '../modules/local/ampliconsuite/prepareaa.nf' -include { COLLECT_SEEDS } from '../modules/local/collect_seeds.nf' -include { AMPLIFIED_INTERVALS } from '../modules/local/amplified_intervals.nf' -include { AMPLICONARCHITECT_AMPLICONARCHITECT } from '../modules/local/ampliconarchitect/ampliconarchitect.nf' -include { AMPLICONCLASSIFIER_AMPLICONCLASSIFIER } from '../modules/local/ampliconclassifier/ampliconclassifier.nf' -include { AMPLICONCLASSIFIER_AMPLICONSIMILARITY } from '../modules/local/ampliconclassifier/ampliconsimilarity.nf' -include { AMPLICONCLASSIFIER_MAKEINPUT } from '../modules/local/ampliconclassifier/makeinput.nf' -include { AMPLICONCLASSIFIER_MAKERESULTSTABLE } from '../modules/local/ampliconclassifier/makeresultstable.nf' +include { AMPLICONSUITE } from '../modules/local/ampliconsuite/ampliconsuite.nf' // Unicycler include { UNICYCLER } from '../modules/local/unicycler/main.nf' @@ -167,7 +159,7 @@ include { MINIMAP2_ALIGN } from '../modules/nf-core/minimap2/align/main // MULTIQC -include { MULTIQC } from '../modules/local/multiqc.nf' +include { MULTIQC } from '../modules/local/multiqc/main.nf' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -196,7 +188,7 @@ workflow CIRCDNA { // SUBWORKFLOW: Read in samplesheet, validate and stage input files // INPUT_CHECK ( - ch_input + file(params.input) ) .reads .map { @@ -294,7 +286,7 @@ workflow CIRCDNA { } else if (params.input_format == "BAM") { // Use BAM Files as input INPUT_CHECK ( - 
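With the schema change above, `--bwa_index` now takes a directory (`format: directory-path`) instead of a glob over the individual index files, and the workflow wraps it into a `["bwa_index", dir]` tuple. A minimal sketch of how that is supplied (the path is a placeholder; the directory is expected to already contain the .amb/.ann/.bwt/.pac/.sa files):

params {
    bwa_index = '/path/to/bwa_index_dir'   // placeholder directory holding the BWA index files
}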
ch_input + file(params.input) ) if (!params.bam_sorted){ SAMTOOLS_SORT_BAM ( @@ -400,85 +392,14 @@ workflow CIRCDNA { } if (run_ampliconarchitect) { - CNVKIT_BATCH ( - ch_bam_sorted.join(ch_bam_sorted_bai), - ch_fasta, - ch_cnvkit_reference - ) - ch_versions = ch_versions.mix(CNVKIT_BATCH.out.versions) - - CNVKIT_SEGMENT ( - CNVKIT_BATCH.out.cnr + AMPLICONSUITE ( + ch_bam_sorted, + file(params.mosek_license_dir), + file(params.aa_data_repo) ) - ch_versions = ch_versions.mix(CNVKIT_SEGMENT.out.versions) - - // PREPAREAA ( - // ch_bam_sorted.join(CNVKIT_SEGMENT.out.cns) - // ) - // ch_versions = ch_versions.mix(PREPAREAA.out.versions) - COLLECT_SEEDS ( - CNVKIT_SEGMENT.out.cns - ) - ch_versions = ch_versions.mix(COLLECT_SEEDS.out.versions) - - ch_aa_seeds = COLLECT_SEEDS.out.bed - AMPLIFIED_INTERVALS ( - ch_aa_seeds.join(ch_bam_sorted).join(ch_bam_sorted_bai) - ) - ch_versions = ch_versions.mix(AMPLIFIED_INTERVALS.out.versions) - - AMPLICONARCHITECT_AMPLICONARCHITECT ( - ch_bam_sorted.join(ch_bam_sorted_bai). - join(AMPLIFIED_INTERVALS.out.bed) - ) - - // AMPLICONARCHITECT_AMPLICONARCHITECT ( - // ch_bam_sorted.join(ch_bam_sorted_bai). - // join(PREPAREAA.out.bed) - // ) - ch_versions = ch_versions.mix(AMPLICONARCHITECT_AMPLICONARCHITECT.out.versions) - - ch_aa_cycles = AMPLICONARCHITECT_AMPLICONARCHITECT.out.cycles. - map {meta, path -> [path]} - ch_aa_graphs = AMPLICONARCHITECT_AMPLICONARCHITECT.out.graph. - map {meta, path -> [path]} - - AMPLICONCLASSIFIER_MAKEINPUT ( - ch_aa_graphs.flatten().collect().ifEmpty([]), - ch_aa_cycles.flatten().collect().ifEmpty([]) - ) - - AMPLICONCLASSIFIER_AMPLICONCLASSIFIER ( - AMPLICONCLASSIFIER_MAKEINPUT.out.input - ) - ac_input_ch = AMPLICONCLASSIFIER_MAKEINPUT.out.input - ch_versions = ch_versions.mix(AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.versions) - AMPLICONCLASSIFIER_AMPLICONSIMILARITY ( - ac_input_ch - ) - ch_versions = ch_versions.mix(AMPLICONCLASSIFIER_AMPLICONSIMILARITY.out.versions) - - ac_input_ch. - map {file -> ["group", file]}. - set {ac_results_input_ch} - AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.class_tsv. - map {file -> ["group", file]}. - set {ac_class_ch} - // ac_results_input_ch.join(ac_class_ch). 
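The rewrite above collapses the CNVKit / CollectSeeds / AmpliconArchitect / AmpliconClassifier chain into a single `AMPLICONSUITE` call fed with the sorted BAM, `params.mosek_license_dir` and `params.aa_data_repo`. A hedged sketch of the parameters such a run needs (paths are placeholders; the license file must be named `mosek.lic` per the error message above):

params {
    circle_identifier = 'ampliconarchitect'
    mosek_license_dir = '/path/to/mosek_license_dir'   // placeholder; must contain mosek.lic
    aa_data_repo      = '/path/to/aa_data_repo'        // placeholder
    reference_build   = 'GRCh38'                       // one of 'GRCh37', 'GRCh38', 'mm10' (hg19 is also accepted by the check)
}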
- // map{group, input_file, class_file -> [input_file, class_file]} - - AMPLICONCLASSIFIER_MAKERESULTSTABLE ( - ac_input_ch, - AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.class_tsv, - AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.gene_list, - AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.entropy, - AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.basic_properties, - AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.bed_files - ) - ch_versions = ch_versions.mix(AMPLICONCLASSIFIER_MAKERESULTSTABLE.out.versions) + ch_versions = ch_versions.mix(AMPLICONSUITE.out.versions) } - // // SUBWORKFLOW - RUN CIRCLE_FINDER PIPELINE // @@ -596,7 +517,7 @@ workflow CIRCDNA { MINIMAP2_ALIGN ( ch_circular_fastq, - ch_fasta, + ch_fasta_meta, false, false, false @@ -615,9 +536,16 @@ workflow CIRCDNA { // MODULE: MultiQC // if (!params.skip_multiqc) { - workflow_summary = WorkflowCircdna.paramsSummaryMultiqc(workflow, summary_params) + workflow_summary = WorkflowCircdna.paramsSummaryMultiqc(workflow, summary_params) ch_workflow_summary = Channel.value(workflow_summary) + methods_description = WorkflowCircdna.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description, params) + ch_methods_description = Channel.value(methods_description) + ch_multiqc_files = Channel.empty() + ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) + ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) + ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) + MULTIQC ( ch_multiqc_config, ch_multiqc_custom_config.collect().ifEmpty([]), @@ -649,12 +577,20 @@ workflow.onComplete { if (params.email || params.email_on_fail) { NfcoreTemplate.email(workflow, params, summary_params, projectDir, log, multiqc_report) } + NfcoreTemplate.dump_parameters(workflow, params) NfcoreTemplate.summary(workflow, params, log) if (params.hook_url) { NfcoreTemplate.IM_notification(workflow, params, summary_params, projectDir, log) } } +workflow.onError { + if (workflow.errorReport.contains("Process requirement exceeds available memory")) { + println("🛑 Default resources exceed availability 🛑 ") + println("💡 See here on how to configure pipeline: https://nf-co.re/docs/usage/configuration#tuning-workflow-resources 💡") + } +} + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ THE END
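The new `workflow.onError` handler above points users at the nf-core resource-tuning documentation when a process exceeds the available memory. For completeness, a minimal sketch of the kind of custom config that advice leads to (process name and values are illustrative only):

process {
    withName: 'SAMTOOLS_SORT' {
        cpus   = 8
        memory = 32.GB
        time   = 12.h
    }
}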
[Remainder truncated: only stripped HTML-table residue from new test snapshot fixtures survives here, namely a software-versions table (CUSTOM_DUMPSOFTWAREVERSIONS, python 3.11.7, yaml 5.4.1, TOOL1/tool1 0.11.9, TOOL2/tool2 1.9, Workflow/Nextflow) followed by repeated FastQC "File type: Conventional base calls" rows; the underlying diff hunks are not recoverable.]