Documentation updated (tutorial+api)

pyranges · Dec 23, 2024 · 470d714 · 470d714
1 parent 323b922
commit 470d714
Show file tree

Hide file tree

Showing 11 changed files with 231 additions and 131 deletions.
diff --git a/docs/api_reference.rst b/docs/api_reference.rst
@@ -5,6 +5,8 @@ API reference
 #. :doc:`Setting variables <./prp_settings>`: engine (compulsory), id_col, theme and warnings
 #. :doc:`Register plot <./prp_registerplot>`
 #. :doc:`Customization options <./prp_options>`
+#. :doc:`VCF tools <./prp_vcf>`
+#. :doc:`Scatterplot creation <./prp_scatter>`
 
 .. toctree::
    :maxdepth: 2
@@ -13,4 +15,6 @@ API reference
    prp_plot
    prp_settings
    prp_registerplot
-   prp_options
+   prp_options
+   prp_vcf
+   prp_scatter
diff --git a/docs/conf.py b/docs/conf.py
@@ -12,7 +12,10 @@
 #
 from docutils import nodes
 import sphinx_rtd_theme
+import os
+import sys
 
+sys.path.insert(0, os.path.abspath("../src"))
 
 # -- Project information -----------------------------------------------------
 

diff --git a/docs/images/prp_rtd_23.png b/docs/images/prp_rtd_23.png
diff --git a/docs/images/prp_rtd_24.png b/docs/images/prp_rtd_24.png
diff --git a/docs/images/prp_rtd_25.png b/docs/images/prp_rtd_25.png
diff --git a/docs/images/prp_rtd_26.png b/docs/images/prp_rtd_26.png
diff --git a/docs/prp_scatter.rst b/docs/prp_scatter.rst
@@ -0,0 +1,9 @@
+Scatterplot creation
+--------------------
+
+Creates a Scatterplot on PyRanges objects.
+
+.. automodule:: pyranges_plot
+    :members:
+    :imported-members:
+    :exclude-members: set_engine, get_engine, set_id_col, get_id_col, set_theme, get_theme, set_warnings, get_warnings, plot, print_options, set_options, reset_options, register_plot, ncbi_gff, ncbi_vcf
diff --git a/docs/prp_vcf.rst b/docs/prp_vcf.rst
@@ -0,0 +1,10 @@
+VCF tools
+---------
+
+Functions related to loading, processing, and transforming VCF (Variant Call Format) files. These tools allow for efficient 
+reading of VCF files into PyRanges objects and flexible manipulation of their fields. For further explanation check the 
+**Dealing with VCF files** section of the :ref:`tutorial <tutorial>`.
+
+.. automodule:: pyranges_plot.vcf
+    :members:
+    :imported-members: 
diff --git a/docs/tutorial.rst b/docs/tutorial.rst
diff --git a/src/pyranges_plot/vcf/vcf_preparation.py b/src/pyranges_plot/vcf/vcf_preparation.py
@@ -7,26 +7,26 @@ def split_fields(data, target_cols: str | list, field_sep: str, col_name_sep: st
 
     Parameters
     ----------
-    data (pd.DataFrame): 
+    data : pd.DataFrame
             The input DataFrame containing the columns to be split.
-    target_cols (str | list): 
+    target_cols : str or list of str
             Column name(s) in the DataFrame to be split. Can be a single column (str) 
             or a list of column names.
-    field_sep (str): 
+    field_sep : str
             Separator used to split the fields in the target column(s).
-    col_name_sep (str | None, optional): 
+    col_name_sep : str, default None
             If provided, this separator is used to split each field into a column name 
             and value. For example, `"key=value"` will generate a column named `key` 
             with the corresponding `value`. Defaults to None.
-    col_names (list[str] | None, optional): 
+    col_names : list[str], default None
             A list of names for the new columns. If not provided, column names are 
             generated automatically based on the target column name and field index. 
             If `col_name_sep` is specified, the column names can be inferred from the field keys. 
             Defaults to None.
-    col_types (list[str] | None, optional): 
+    col_types : list[str], default None
             A list of data types for the new columns. If not provided, columns will 
             retain their default inferred types. Defaults to None.
-    keep_col (bool, optional): 
+    keep_col : bool, default False
             Whether to retain the original target column(s) in the output DataFrame. 
             Defaults to False (the original column(s) will be removed).
     
@@ -43,8 +43,40 @@ def split_fields(data, target_cols: str | list, field_sep: str, col_name_sep: st
             If the number of provided `col_names` does not match the number of new columns generated.
         ValueError: 
             If the number of provided `col_types` does not match the number of new columns generated.
-
     
+    Examples
+    --------
+    >>> vcf = prp.example_data.ncbi_vcf()
+    >>> vcf
+    index    |    Chromosome    Start     ID            REF       ALT       QUAL      FILTER      ...
+    int64    |    object        int32     object        object    object    object    category    ...
+    -------  ---  ------------  --------  ------------  --------  --------  --------  ----------  -----
+    0        |    1             943995    rs761448939   C         G,T       nan       .           ...
+    1        |    1             964512    rs756054473   C         A,T       nan       .           ...
+    2        |    1             976215    rs7417106     A         C,G,T     nan       .           ...
+    3        |    1             1013983   rs1644247121  G         A         nan       .           ...
+    ...      |    ...           ...       ...           ...       ...       ...       ...         ...
+    242182   |    Y             2787592   rs104894975   A         T         nan       .           ...
+    242183   |    Y             2787600   rs104894977   G         A         nan       .           ...
+    242184   |    Y             7063898   rs199659121   A         T         nan       .           ...
+    242185   |    Y             12735725  rs778145751   TAAGT     T         nan       .           ...
+    PyRanges with 242186 rows, 9 columns, and 1 index columns. (2 columns not shown: "INFO", "End").
+    Contains 25 chromosomes.
+    >>> prp.vcf.split_fields(vcf,target_cols="INFO",field_sep=";",col_name_sep="=")
+    index    |    Chromosome    Start     ID            REF       ALT       QUAL      FILTER      End       INFO_0     TSA       INFO_2                  INFO_3                  ...
+    int64    |    object        int32     object        object    object    object    category    int32     object     object    object                  object                  ...
+    -------  ---  ------------  --------  ------------  --------  --------  --------  ----------  --------  ---------  --------  ----------------------  ----------------------  -----
+    0        |    1             943995    rs761448939   C         G,T       nan       .           943996    dbSNP_156  SNV       E_Freq                  E_Cited                 ...
+    1        |    1             964512    rs756054473   C         A,T       nan       .           964513    dbSNP_156  SNV       E_Freq                  E_Cited                 ...
+    2        |    1             976215    rs7417106     A         C,G,T     nan       .           976216    dbSNP_156  SNV       E_Freq                  E_1000G                 ...
+    3        |    1             1013983   rs1644247121  G         A         nan       .           1013984   dbSNP_156  SNV       E_Phenotype_or_Disease  CLIN_pathogenic         ...
+    ...      |    ...           ...       ...           ...       ...       ...       ...         ...       ...        ...       ...                     ...                     ...
+    242182   |    Y             2787592   rs104894975   A         T         nan       .           2787593   dbSNP_156  SNV       E_Cited                 E_Phenotype_or_Disease  ...
+    242183   |    Y             2787600   rs104894977   G         A         nan       .           2787601   dbSNP_156  SNV       E_Cited                 E_Phenotype_or_Disease  ...
+    242184   |    Y             7063898   rs199659121   A         T         nan       .           7063899   dbSNP_156  SNV       E_Freq                  E_Cited                 ...
+    242185   |    Y             12735725  rs778145751   TAAGT     T         nan       .           12735726  dbSNP_156  indel     E_Freq                  E_Cited                 ...
+    PyRanges with 242186 rows, 31 columns, and 1 index columns. (19 columns not shown: "INFO_4", "INFO_5", "INFO_6", ...).
+    Contains 25 chromosomes.
     """
     result_data = data.copy()
     if isinstance(target_cols, str):

diff --git a/src/pyranges_plot/vcf/vcf_reader.py b/src/pyranges_plot/vcf/vcf_reader.py
@@ -4,6 +4,63 @@
 from io import StringIO
 
 def read_vcf(f: str | Path, nrows: bool | None = None):
+    """
+    Read a VCF (Variant Call Format) file and convert it into a PyRanges object.
+
+    This function processes a VCF file by reading the data, extracting the header and
+    data lines, and creating a PyRanges object for genomic analysis. The metadata
+    lines (lines starting with '##') are ignored, and the column names are extracted 
+    from the header line (starting with '#CHROM').
+
+    Parameters
+    ----------
+    f : str | Path
+        The file path to the VCF file to be read.
+    nrows : int | None, optional
+        The number of rows to read from the file. If None, reads the entire file.
+
+    Returns
+    -------
+    pr.PyRanges
+        A PyRanges object containing the VCF data, adding the following columns:
+
+        - Chromosome: Chromosome names (from 'CHROM' in the VCF).
+        - Start: Start positions of variants (from 'POS' in the VCF).
+        - End: End positions of variants (calculated as Start + 1).
+
+    Raises
+    ------
+    FileNotFoundError
+        If the provided file path does not exist.
+    ValueError
+        If the VCF file is malformed or missing essential fields.
+
+    Notes
+    -----
+    - Missing quality scores ('.') are replaced with pandas.NA.
+    - The function reads the file in chunks for large VCF files to handle memory usage.
+    - Columns 'CHROM' and 'POS' are renamed to 'Chromosome' and 'Start' respectively, 
+      to align with PyRanges conventions.
+
+    Examples
+    --------
+    >>> vcf_pyranges = prp.vcf.read_vcf("example.vcf")
+    >>> vcf_pyranges
+    index    |    Chromosome    Start    ID          REF       ALT       QUAL      FILTER      INFO                       End
+    int64    |    category      int32    category    object    object    object    category    object                     int32
+    -------  ---  ------------  -------  ----------  --------  --------  --------  ----------  -------------------------  -------
+    0        |    1             500      .           A         T         <NA>      PASS        TRANSCRIPT=t1;SECOND_ID=a  501
+    1        |    1             3500     .           A         T         <NA>      PASS        TRANSCRIPT=t1;SECOND_ID=a  3501
+    2        |    1             300      .           A         T         <NA>      PASS        TRANSCRIPT=t2;SECOND_ID=a  301
+    3        |    1             1300     .           A         T         <NA>      PASS        TRANSCRIPT=t2;SECOND_ID=a  1301
+    ...      |    ...           ...      ...         ...       ...       ...       ...         ...                        ...
+    5        |    1             4500     .           A         T         <NA>      PASS        TRANSCRIPT=t3;SECOND_ID=b  4501
+    6        |    1             4900     .           A         T         <NA>      PASS        TRANSCRIPT=t3;SECOND_ID=b  4901
+    7        |    1             5600     .           A         T         <NA>      PASS        TRANSCRIPT=t3;SECOND_ID=b  5601
+    8        |    1             6000     .           A         T         <NA>      PASS        TRANSCRIPT=t4;SECOND_ID=b  6001
+    PyRanges with 9 rows, 9 columns, and 1 index columns.
+    Contains 1 chromosomes.
+    """
+
     path = Path(f)
     dtypes = {
         "CHROM": "category",