Skip to content

vcf_header

Declare Vcf object.

Classes:

  • VcfHeader

    Object that parses and stores VCF header information.

VcfHeader

VcfHeader()

Object that parses and stores VCF header information.

Methods:

Attributes:

  • contigs (Iterator[str]) –

    Get an iterator over the header lines that contain chromosome information.

  • samples_index (dict[str, int] | None) –

    Read vcf header to generate an association map between sample name and index.

Source code in src/variantplaner/objects/vcf_header.py
27
28
29
def __init__(self) -> None:
    """Initialise VcfHeader with an empty list of header lines."""
    # Filled later by from_files / from_lines, which append one stripped line per entry.
    self._header: list[str] = []

contigs cached property

contigs: Iterator[str]

Get an iterator over the header lines that contain chromosome information.

Returns: String iterator

samples_index cached property

samples_index: dict[str, int] | None

Read vcf header to generate an association map between sample name and index.

Args: header: Header string.

Returns: Map that associates each sample name with its sample index.

Raises: NotVcfHeaderError: If no line starts with '#CHR'

column_name

column_name(
    number_of_column: int = MINIMAL_COL_NUMBER,
) -> Iterator[str]

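Get an iterator over the column names: the eight fixed VCF columns, followed by "format" and the sample names when more columns are requested and the header declares samples.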

Returns: String iterator

Source code in src/variantplaner/objects/vcf_header.py
225
226
227
228
229
230
231
232
233
234
235
236
def column_name(self, number_of_column: int = MINIMAL_COL_NUMBER) -> typing.Iterator[str]:
    """Yield the column names matching a VCF with `number_of_column` columns.

    The eight fixed columns are always produced. "format" and the sample
    names follow only when more than MINIMAL_COL_NUMBER columns are requested
    and the header declares at least one sample.

    Args:
        number_of_column: Number of columns expected in the VCF body.

    Returns: String iterator
    """
    yield from ("chr", "pos", "vid", "ref", "alt", "qual", "filter", "info")

    # Nothing beyond the fixed columns was requested: do not touch samples_index at all.
    if number_of_column <= MINIMAL_COL_NUMBER:
        return

    sample_map = self.samples_index
    if not sample_map:
        return

    yield "format"
    # Iterating the mapping yields the sample names in insertion order.
    yield from sample_map

format_parser

format_parser(
    select_format: set[str] | None = None,
) -> dict[str, Callable[[Expr, str], Expr]]

Generate a list of polars.Expr to extract genotypes information.

Warning: Float values cannot be converted for the moment; they are stored as String to preserve the information.

Args: select_format: Set of FORMAT field ids to keep; every FORMAT field is kept when it is None or empty.

Returns: A dict to link format id to pipeable function with Polars.Expr

Raises: NotVcfHeaderError: If no header line starts with '#CHROM'

Source code in src/variantplaner/objects/vcf_header.py
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
def format_parser(
    self,
    select_format: set[str] | None = None,
) -> dict[str, typing.Callable[[polars.Expr, str], polars.Expr]]:
    """Map each FORMAT field id declared in the header to the function that parses it.

    The returned callables are meant to be piped on a
    [polars.Expr](https://pola-rs.github.io/polars/py-polars/html/reference/expressions/index.html)
    to extract genotype information.

    **Warning**: Float values can't be converted for the moment they are stored as String to keep information

    Args:
        select_format: Set of FORMAT field ids to keep. When None or empty,
            every FORMAT field found in the header is kept.

    Returns:
        A dict to link format id to pipeable function with Polars.Expr

    Raises:
        NotVcfHeaderError: If no header line starts with '#CHROM'
    """
    format_re = re.compile(
        "ID=(?P<id>[A-Za-z_][0-9A-Za-z_.]*),Number=(?P<number>[ARG0-9\\.]+),Type=(?P<type>Integer|Float|String|Character)",
    )

    parsers: dict[str, typing.Callable[[polars.Expr, str], polars.Expr]] = {}

    for header_line in self._header:
        # The column line closes the header: everything declared has been seen.
        if header_line.startswith("#CHROM"):
            return parsers

        if not header_line.startswith("##FORMAT"):
            continue

        found = format_re.search(header_line)
        if found is None:
            continue

        name = found["id"]
        if select_format and name not in select_format:
            continue

        # Genotype has its own dedicated parser, whatever Number/Type say.
        if name == "GT":
            parsers["GT"] = VcfHeader.__format_gt
            continue

        single_value = found["number"] == "1"

        if found["type"] == "Integer":
            parsers[name] = VcfHeader.__format_one_int if single_value else VcfHeader.__format_list_int
        else:
            # The regex only lets Float, String and Character through here;
            # Float is kept as string until float conversion is supported.
            parsers[name] = VcfHeader.__format_one_str if single_value else VcfHeader.__format_list_str

    # Header exhausted without meeting the '#CHROM' line.
    raise NotVcfHeaderError

from_files

from_files(path: Path) -> None

Populate the VcfHeader object with the content of a file that contains only a header.

Args: path: Path of file

Returns: None

Source code in src/variantplaner/objects/vcf_header.py
31
32
33
34
35
36
37
38
39
40
41
42
43
def from_files(self, path: pathlib.Path) -> None:
    """Populate VcfHeader object with the content of a header-only file.

    Every line of the file is stripped of surrounding whitespace and appended
    to the stored header; no check is made that the lines start with '#'.

    Args:
        path: Path of file

    Returns:
        None
    """
    # Decode explicitly as UTF-8 instead of relying on the locale default, so
    # the same file is read identically on every platform.
    with open(path, encoding="utf-8") as fh:
        self._header.extend(full_line.strip() for full_line in fh)

from_lines

from_lines(lines: Iterator[str]) -> None

Extract all header information of vcf lines.

The header is made of the lines from the start of the input up to and including the first line that starts with '#CHROM'; every one of them must start with '#'.

Args: lines: Iterator of line

Returns: None

Raises: NotVcfHeaderError: If a line does not start with '#' NotVcfHeaderError: If no line starts with '#CHROM'

Source code in src/variantplaner/objects/vcf_header.py
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
def from_lines(self, lines: typing.Iterator[str]) -> None:
    """Collect the header lines found at the beginning of a vcf line stream.

    Lines are consumed, stripped and stored up to and including the first one
    that starts with '#CHROM'. Lines after it are left in the iterator.

    Args:
        lines: Iterator of line

    Returns: None

    Raises:
        NotVcfHeaderError: If a line does not start with '#'
        NotVcfHeaderError: If no line starts with '#CHROM'
    """
    for raw in lines:
        stripped = raw.strip()

        # A non-comment line before '#CHROM' means this is not a valid header.
        if not stripped.startswith("#"):
            raise NotVcfHeaderError

        self._header.append(stripped)

        # The column line is the last header line: stop consuming here.
        if stripped.startswith("#CHROM"):
            return

    # Input exhausted without meeting the '#CHROM' line.
    raise NotVcfHeaderError

info_parser

info_parser(
    select_info: set[str] | None = None,
) -> list[Expr]

Generate a list of polars.Expr to extract variants information.

Args: select_info: Set of INFO field ids to keep; every INFO field is kept when it is None or empty.

Returns: List of polars.Expr to parse info columns.

Raises: NotVcfHeaderError: If no header line starts with '#CHROM'

Source code in src/variantplaner/objects/vcf_header.py
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
def info_parser(self, select_info: set[str] | None = None) -> list[polars.Expr]:
    """Generate a list of [polars.Expr](https://pola-rs.github.io/polars/py-polars/html/reference/expressions/index.html) to extract variants information.

    One expression is built per `##INFO` header line whose Type is Integer,
    Float, String or Character. Each expression extracts the field value from
    the "info" column, casts it according to the declared Number/Type and is
    aliased with the field id.

    Args:
        select_info: Set of INFO field ids to keep. When None or empty, every
            INFO field found in the header is kept.

    Returns:
        List of [polars.Expr](https://pola-rs.github.io/polars/py-polars/html/reference/expressions/index.html) to parse info columns.

    Raises:
        NotVcfHeaderError: If no header line starts with '#CHROM'
    """
    info_re = re.compile(
        r"ID=(?P<id>([A-Za-z_][0-9A-Za-z_.]*|1000G)),Number=(?P<number>[ARG0-9\.]+),Type=(?P<type>Integer|Float|String|Character)",
    )

    # Numeric VCF types and the polars dtype they are cast to; String and
    # Character are left as extracted.
    numeric_dtypes = {"Integer": polars.Int64, "Float": polars.Float64}

    expressions: list[polars.Expr] = []

    for line in self._header:
        # The column line closes the header: everything declared has been seen.
        if line.startswith("#CHROM"):
            return expressions

        if not line.startswith("##INFO"):
            continue

        search = info_re.search(line)
        if search is None:
            continue

        field_id = search["id"]
        if select_info and field_id not in select_info:
            continue

        # Anchor the key on the start of the column or on a ';' separator so
        # that e.g. "AF" is not found inside "CAF=...", and escape the id
        # because it may contain '.', which would otherwise match any character.
        regex = rf"(?:^|;){re.escape(field_id)}=([^;]+)"

        local_expr = polars.col("info").str.extract(regex, 1)
        dtype = numeric_dtypes.get(search["type"])

        if search["number"] == "1":
            if dtype is not None:
                local_expr = local_expr.cast(dtype)
        else:
            # Any other Number (A, R, G, '.', n) is a comma separated list.
            local_expr = local_expr.str.split(",")
            if dtype is not None:
                local_expr = local_expr.cast(polars.List(dtype))

        expressions.append(local_expr.alias(field_id))

    # Header exhausted without meeting the '#CHROM' line.
    raise NotVcfHeaderError