Skip to content

normalization

Functions used to normalize data.

Functions:

add_id_part

add_id_part(
    lf: LazyFrame, number_of_bits: int = 8
) -> LazyFrame

Add column id part.

If the id is a large variant id, id_part is set to 255; otherwise the 8 most significant bits of the position are used.

Parameters:

Returns:

Source code in src/variantplaner/normalization.py
78
79
80
81
82
83
84
85
86
87
88
89
def add_id_part(lf: polars.LazyFrame, number_of_bits: int = 8) -> polars.LazyFrame:
    """Append an id_part column derived from the id column.

    The value comes from the `variant_id.partition` expression applied to the id column.
    With the default of 8 bits, a large variant id gets an id_part of 255; for any other id
    the 8 most significant bits of the position are used.

    Args:
        lf: [polars.LazyFrame](https://pola-rs.github.io/polars/py-polars/html/reference/lazyframe/index.html) contains: id column.
        number_of_bits: Forwarded unchanged to `variant_id.partition`; presumably the width in bits of id_part (TODO confirm against the variant_id plugin).

    Returns:
        [polars.LazyFrame](https://pola-rs.github.io/polars/py-polars/html/reference/lazyframe/index.html) with column id_part added
    """
    # `variant_id` is not a built-in polars namespace (presumably registered by a plugin), hence the type: ignore.
    id_part = polars.col("id").variant_id.partition(number_of_bits=number_of_bits)  # type: ignore # noqa: PGH003
    return lf.with_columns(id_part=id_part)

add_variant_id

add_variant_id(
    lf: LazyFrame, chrom2length: LazyFrame
) -> LazyFrame

Add a column id of variants.

Id computation is based on the chromosome, position, reference and alternative sequences of the variant.

Two different algorithms are used to calculate the variant identifier, depending on the cumulative length of the reference and alternative sequences.

If the cumulative length of the reference and alternative sequences is short, the leftmost bit of the id is set to 0, then a unique 63-bit hash of the variant is calculated.

If the cumulative length of the reference and alternative sequences is long, the leftmost bit of the id is set to 1, followed by a hash (the hash function used in Firefox) of the chromosome, position, reference and alternative sequences, truncated to the remaining 63 bits.

If lf.columns contains SVTYPE and SVLEN, any variant whose alt matches the regex <([^:]+).*> with a captured group equal to SVTYPE has its alt replaced by the concatenation of SVTYPE and the first value of SVLEN.

Parameters:

Returns:

Source code in src/variantplaner/normalization.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
def add_variant_id(lf: polars.LazyFrame, chrom2length: polars.LazyFrame) -> polars.LazyFrame:
    """Add an id column identifying each variant.

    The id is computed by the `variant_id.compute` expression from a genome-wide position
    (pos plus the offset of the variant's chromosome), the ref and alt sequences, and the sum
    of all chromosome lengths.

    Two different algorithms are used to calculate the variant identifier, depending on the cumulative length of the reference and alternative sequences.

    If the cumulative length of the reference and alternative sequences is short, the leftmost bit of the id is set to 0, then a unique 63-bit hash of the variant is calculated.

    If the cumulative length of the reference and alternative sequences is long, the leftmost bit of the id is set to 1, followed by a hash function, used in Firefox, of the chromosome, position, reference and alternative sequence.
    NOTE(review): earlier documentation called the flag bit of the long case "right-most"; confirm against the variant_id plugin.

    If lf.columns contains SVTYPE and SVLEN, any alt whose `<([^:]+).*>` group equals SVTYPE is replaced by SVTYPE and the first value of SVLEN joined with "-", then right-padded with "-".

    The alt column of the returned frame keeps these rewrites (SVTYPE/SVLEN substitution and "*" expansion); it is not restored to its input value.

    Args:
        lf: [polars.LazyFrame](https://pola-rs.github.io/polars/py-polars/html/reference/lazyframe/index.html) contains: chr, pos, ref, alt columns, and optionally SVTYPE and SVLEN.
        chrom2length: [polars.LazyFrame](https://pola-rs.github.io/polars/py-polars/html/reference/lazyframe/index.html) contains: contig, length and offset columns.

    Returns:
        [polars.LazyFrame](https://pola-rs.github.io/polars/py-polars/html/reference/lazyframe/index.html) with column id added; the helper columns real_pos, length and offset are dropped.
    """
    # Sum of every chromosome length: the upper bound of a genome-wide position.
    genome_length = chrom2length.select([polars.col("length").sum()]).collect().get_column("length").max()

    # Bits left in a 64-bit id once the largest position is stored, minus 2, halved, plus 1.
    # Presumably the sequence length from which a variant counts as "large" (TODO confirm).
    position_bits = len(format(genome_length, "b"))
    large_variant_len = (64 - position_bits - 2) // 2 + 1

    alt = polars.col("alt")

    available = lf.collect_schema().names()
    if "SVTYPE" in available and "SVLEN" in available:
        sv_type = polars.col("SVTYPE")
        # True when alt is a symbolic allele such as <TYPE> or <TYPE:...> whose TYPE equals SVTYPE.
        is_symbolic_sv = alt.str.replace("<(?<type>[^:]+).*>", "$type") == sv_type
        # "<SVTYPE>-<first SVLEN>" padded with "-" up to large_variant_len characters.
        sv_label = polars.concat_str(
            [sv_type, polars.col("SVLEN").list.get(0)],
            separator="-",
        )
        padded_sv_alt = alt.str.replace(".+", sv_label).str.pad_end(large_variant_len, "-")
        lf = lf.with_columns(alt=polars.when(is_symbolic_sv).then(padded_sv_alt).otherwise(alt))

    # `variant_id` is not a built-in polars namespace (presumably registered by a plugin), hence the type: ignore.
    variant_id = polars.col("real_pos").variant_id.compute(  # type: ignore # noqa: PGH003
        polars.col("ref"),
        polars.col("alt"),
        genome_length,
    )

    return (
        # Expand "*" in alt to large_variant_len stars (presumably to force the large-variant path).
        lf.with_columns(alt=alt.str.replace("\\*", "*" * large_variant_len))
        # Bring in the per-chromosome length and offset columns.
        .join(chrom2length, right_on="contig", left_on="chr", how="left", coalesce=True)
        # Genome-wide position; must exist before the id expression below is evaluated.
        .with_columns(real_pos=polars.col("pos") + polars.col("offset"))
        .with_columns(id=variant_id)
        .drop(["real_pos", "length", "offset"])
    )