Skip to content

extract

Extract information from a polars.LazyFrame produced by parsing a raw VCF file.

genotypes

genotypes(lf: LazyFrame, col2expr: dict[str, Callable[[Expr, str], Expr]], format_str: str = 'GT:AD:DP:GQ') -> LazyFrame

Extract genotype information from a raw polars.LazyFrame.

Only lines whose format value starts with format_str are considered.

Parameters:

  • lf (LazyFrame) –

    The target polars.LazyFrame

  • col2expr (dict[str, Callable[[Expr, str], Expr]]) –

    A dict associating each column name with the function applied to create the corresponding polars.LazyFrame column (produced by io.vcf.format2expr)

  • format_str (str, default: 'GT:AD:DP:GQ' ) –

    Only variants whose format matches this string are considered

Returns:

  • LazyFrame

    A polars.LazyFrame with variant id, sample information and genotypes information

Raises:

  • NoGenotypeError –

    If none of the lf columns is equal to 'format'

Source code in src/variantplaner/extract.py
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
def genotypes(
    lf: polars.LazyFrame,
    col2expr: dict[str, Callable[[polars.Expr, str], polars.Expr]],
    format_str: str = "GT:AD:DP:GQ",
) -> polars.LazyFrame:
    """Build a long-format genotype table from a raw [polars.LazyFrame](https://pola-rs.github.io/polars/py-polars/html/reference/lazyframe/index.html).

    The `format` column and every column located after it are kept, rows whose
    `format` value does not start with `format_str` are dropped, and the
    remaining columns are melted around `id` so that each output row pairs one
    `id` with one sample column. The colon-separated sample value is then split
    into one column per field of `format_str`.

    Because only columns from `format` onward are retained before melting on
    `id`, the `id` column is expected to sit after `format` in `lf`.

    Args:
        lf: The target [polars.LazyFrame](https://pola-rs.github.io/polars/py-polars/html/reference/lazyframe/index.html)
        col2expr: Maps each field name of `format_str` to a callable invoked as `function(expr, name=field)` on the extracted sub-value (produced by io.vcf.format2expr). The final selection looks columns up by lowercased field name, so each callable presumably names its output that way.
        format_str: Colon-separated field layout; only rows whose `format` value starts with it are kept

    Returns:
        A [polars.LazyFrame](https://pola-rs.github.io/polars/py-polars/html/reference/lazyframe/index.html) with `id`, `sample` and one column per lowercased field of `format_str`. When a `gt` column is present, rows where it equals 0 are filtered out.

    Raises:
        NoGenotypeError: If none of the lf columns is equal to 'format'
    """
    all_columns = lf.columns
    if "format" not in all_columns:
        raise NoGenotypeError

    # Keep "format" and everything that follows it
    from_format = all_columns[all_columns.index("format") :]
    lf = lf.select(from_format)

    # Discard rows with an unexpected layout, then drop the "format" column itself
    well_formed = lf.filter(polars.col("format").str.starts_with(format_str))
    lf = well_formed.select(*from_format[1:])

    # Position of each field inside the colon-separated sample value
    field_position = {name: position for position, name in enumerate(format_str.split(":"))}

    # One row per (id, sample), with the raw value split into its fields
    melted = lf.melt(id_vars=["id"])
    melted = melted.with_columns(
        [
            polars.col("id"),
            polars.col("variable").alias("sample"),
            polars.col("value").str.split(":"),
        ],
    )

    # Turn each field of the split value into its own column
    field_exprs = [
        polars.col("value").list.get(position).pipe(function=col2expr[name], name=name)  # type: ignore # noqa: PGH003
        for name, position in field_position.items()
    ]
    per_sample = melted.with_columns(field_exprs)

    # Select interesting columns
    per_sample = per_sample.select(["id", "sample", *(name.lower() for name in field_position)])

    if "gt" not in per_sample.columns:
        return per_sample

    return per_sample.filter(polars.col("gt") != 0)

merge_variants_genotypes

merge_variants_genotypes(variants_lf: LazyFrame, genotypes_lf: LazyFrame, sample_name: list[str]) -> LazyFrame

Merge variants and genotypes polars.LazyFrame.

Parameters:

  • variants_lf (LazyFrame) –

    lazyframe with variants, column: (id, chr, pos, ref, alt).

  • genotypes_lf (LazyFrame) –

    lazyframe with genotypes, column: (id, sample, [genotype column]).

Returns:

  • LazyFrame

    A lazyframe with all data

Source code in src/variantplaner/extract.py
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
def merge_variants_genotypes(
    variants_lf: polars.LazyFrame,
    genotypes_lf: polars.LazyFrame,
    sample_name: list[str],
) -> polars.LazyFrame:
    """Merge variants and genotypes [polars.LazyFrame](https://pola-rs.github.io/polars/py-polars/html/reference/lazyframe/index.html).

    For each sample, the rows of `genotypes_lf` belonging to that sample are
    isolated, their genotype columns are prefixed with `<sample>_`, the
    `sample` column is dropped, and the result is joined to `variants_lf` on
    `id` with an `outer_coalesce` join.

    Args:
        variants_lf: lazyframe with variants, column: (id, chr, pos, ref, alt).
        genotypes_lf: lazyframe with genotypes, column: (id, sample, [genotype column]).
        sample_name: names of the samples to merge; one join is performed per name, in list order.

    Returns:
        A lazyframe with all data. `variants_lf` is returned unchanged when `sample_name` is empty.
    """
    if not sample_name:
        return variants_lf

    # Every column after the first two (id, sample) is a genotype column. The
    # list does not depend on the sample, so resolve the schema once instead
    # of once per iteration.
    genotype_columns = genotypes_lf.columns[2:]

    for sample in sample_name:
        prefixed = {col: f"{sample}_{col}" for col in genotype_columns}
        geno2sample = genotypes_lf.filter(polars.col("sample") == sample).rename(prefixed).drop("sample")
        variants_lf = variants_lf.join(geno2sample, on="id", how="outer_coalesce")

    return variants_lf

variants

variants(lf: LazyFrame) -> LazyFrame

Extract only the variant information from a polars.LazyFrame.

Parameters:

  • lf (LazyFrame) –

    A polars.LazyFrame

Returns:

  • LazyFrame

    A polars.LazyFrame with just variant information (id, chr, pos, ref, alt)

Source code in src/variantplaner/extract.py
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
def variants(lf: polars.LazyFrame) -> polars.LazyFrame:
    """Keep only the variant-describing columns of a polars.LazyFrame.

    Args:
        lf: A [polars.LazyFrame](https://pola-rs.github.io/polars/py-polars/html/reference/lazyframe/index.html) containing at least the columns id, chr, pos, ref and alt

    Returns:
        A [polars.LazyFrame](https://pola-rs.github.io/polars/py-polars/html/reference/lazyframe/index.html) restricted to the variant information, in this order: (id, chr, pos, ref, alt)
    """
    variant_columns = ("id", "chr", "pos", "ref", "alt")
    return lf.select([polars.col(name) for name in variant_columns])