Skip to content

Commit 7135f11

Browse files
authored
Merge pull request #19 from cfe-lab/SeparateCombinedStandardsLogic
Separate logic that combines matching allele pairs into its own method
2 parents 67ba4f6 + d2f39b9 commit 7135f11

6 files changed

Lines changed: 455 additions & 52 deletions

File tree

.github/workflows/test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ jobs:
2222
fail-fast: false
2323
matrix:
2424
os: [ubuntu-latest]
25-
python-version: ["3.10", "3.11", "3.12", "3.x"]
25+
python-version: ["3.11", "3.12", "3.13"]
2626

2727
steps:
2828
- uses: actions/checkout@v4

pyproject.toml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ build-backend = "hatchling.build"
66
name = "hla_algorithm"
77
description = 'Python implementation of the BC-CfE HLA interpretation algorithm'
88
readme = "README.md"
9-
requires-python = ">=3.10"
9+
requires-python = ">=3.11,<3.14"  # exclusive bound: "<=3.13" would reject 3.13.1+ under PEP 440 ordering
1010
license = "MIT"
1111
keywords = []
1212
authors = [
@@ -17,7 +17,6 @@ authors = [
1717
classifiers = [
1818
"Development Status :: 5 - Production/Stable",
1919
"Programming Language :: Python",
20-
"Programming Language :: Python :: 3.10",
2120
"Programming Language :: Python :: 3.11",
2221
"Programming Language :: Python :: 3.12",
2322
"Programming Language :: Python :: 3.13",

src/hla_algorithm/hla_algorithm.py

Lines changed: 60 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from io import TextIOBase
55
from operator import attrgetter
66
from pathlib import Path
7-
from typing import Final, Optional, TypedDict, cast
7+
from typing import Final, Optional, TypedDict
88

99
import numpy as np
1010
import yaml
@@ -18,12 +18,12 @@
1818
HLASequence,
1919
HLAStandard,
2020
HLAStandardMatch,
21+
MatchingAllelePair,
2122
)
2223
from .utils import (
2324
BIN2NUC,
2425
HLA_LOCUS,
2526
StoredHLAStandards,
26-
allele_coordinates_sort_key,
2727
count_strict_mismatches,
2828
nuc2bin,
2929
sort_allele_pairs,
@@ -138,7 +138,9 @@ def load_default_hla_standards() -> LoadedStandards:
138138
:return: List of known HLA standards
139139
:rtype: list[HLAStandard]
140140
"""
141-
with open(HLAAlgorithm.DEFAULT_CONFIG_DIR / "hla_standards.yaml") as standards_file:
141+
with open(
142+
HLAAlgorithm.DEFAULT_CONFIG_DIR / "hla_standards.yaml"
143+
) as standards_file:
142144
return HLAAlgorithm.read_hla_standards(standards_file)
143145

144146
FREQUENCY_LOCUS_COLUMNS: dict[HLA_LOCUS, tuple[str, str]] = {
@@ -230,16 +232,24 @@ def combine_standards_stepper(
230232
matching_stds: Sequence[HLAStandardMatch],
231233
seq: Sequence[int],
232234
mismatch_threshold: int = 0,
233-
) -> Generator[tuple[tuple[int, ...], int, tuple[str, str]], None, None]:
235+
) -> Generator[MatchingAllelePair, None, None]:
234236
"""
235237
Identifies "good" combined standards for the specified sequence.
236238
239+
Humans have two copies of their HLA genes, so when we use Sanger
240+
sequencing to sequence a person's HLA, we get a single sequence with
241+
potentially many mixtures. That is, at any position that the two genes
242+
don't match, we see a nucleotide mixture consisting of the two
243+
corresponding bases.
244+
245+
In order to find matches, we take allele sequences (reduced to ones that
246+
are already "decent" matches for our sequence, to reduce running time)
247+
and "mush" them together to produce potential matches for our sequence.
248+
237249
On each iteration, it continues checking combined standards until it
238-
finds a "match", and yields a tuple containing the details of that
239-
match:
240-
- the combined standard, as a tuple of integers 0-15;
241-
- the number of mismatches identified; and
242-
- the allele pair (i.e. names of the two alleles in the combination).
250+
finds a "match", and yields a MatchingAllelePair containing its details.
251+
252+
PRECONDITION: matching_stds should contain no duplicates.
243253
244254
A "match" is defined by the number of mismatches between the combined
245255
standard and the sequence:
@@ -263,15 +273,6 @@ def combine_standards_stepper(
263273
# "Mush" the two standards together to produce something
264274
# that looks like what you get when you sequence HLA.
265275
std_bin = np.array(std_b.sequence) | np.array(std_a.sequence)
266-
allele_pair: tuple[str, str] = cast(
267-
tuple[str, str],
268-
tuple(
269-
sorted(
270-
(std_a.allele, std_b.allele),
271-
key=allele_coordinates_sort_key,
272-
)
273-
),
274-
)
275276

276277
# There could be more than one combined standard with the
277278
# same sequence, so check if this one's already been found.
@@ -291,28 +292,19 @@ def combine_standards_stepper(
291292
elif mismatches < current_rejection_threshold:
292293
current_rejection_threshold = max(mismatches, mismatch_threshold)
293294

294-
yield (combined_std_bin, mismatches, allele_pair)
295+
yield MatchingAllelePair.create_from_unsorted_alleles(
296+
standard_bin=combined_std_bin,
297+
mismatch_count=mismatches,
298+
allele_names=(std_a.allele, std_b.allele),
299+
)
295300

296301
@staticmethod
297-
def combine_standards(
298-
matching_stds: Sequence[HLAStandardMatch],
299-
seq: Sequence[int],
302+
def collate_matching_allele_pairs(
303+
matching_allele_pairs: Iterable[MatchingAllelePair],
300304
mismatch_threshold: Optional[int] = None,
301305
) -> dict[HLACombinedStandard, int]:
302306
"""
303-
Find the combinations of standards that match the given sequence.
304-
305-
Humans have two copies of their HLA genes, so when we use Sanger
306-
sequencing to sequence a person's HLA, we get a single sequence with
307-
potentially many mixtures. That is, at any position that the two genes
308-
don't match, we see a nucleotide mixture consisting of the two
309-
corresponding bases.
310-
311-
In order to find matches, we take allele sequences (reduced to ones that
312-
are already "decent" matches for our sequence, to reduce running time)
313-
and "mush" them together to produce potential matches for our sequence.
314-
315-
PRECONDITION: matching_stds should contain no duplicates.
307+
Collate the given MatchingAllelePairs into HLACombinedStandards.
316308
317309
Returns a dictionary mapping HLACombinedStandards to their mismatch
318310
counts. If mismatch_threshold is None or 0, then the result contains
@@ -330,13 +322,13 @@ def combine_standards(
330322
combos: dict[tuple[int, ...], tuple[int, list[tuple[str, str]]]] = {}
331323

332324
fewest_mismatches: int | float = float("inf")
333-
for (
334-
combined_std_bin,
335-
mismatches,
336-
allele_pair,
337-
) in HLAAlgorithm.combine_standards_stepper(
338-
matching_stds, seq, mismatch_threshold
339-
):
325+
for matching_allele_pair in matching_allele_pairs:
326+
combined_std_bin: tuple[int, ...] = matching_allele_pair.standard_bin
327+
mismatches: int = matching_allele_pair.mismatch_count
328+
allele_pair: tuple[str, str] = (
329+
matching_allele_pair.allele_1,
330+
matching_allele_pair.allele_2,
331+
)
340332
if combined_std_bin not in combos:
341333
combos[combined_std_bin] = (mismatches, [])
342334
combos[combined_std_bin][1].append(allele_pair)
@@ -362,6 +354,32 @@ def combine_standards(
362354

363355
return result
364356

357+
@staticmethod
358+
def combine_standards(
359+
matching_stds: Sequence[HLAStandardMatch],
360+
seq: Sequence[int],
361+
mismatch_threshold: Optional[int] = None,
362+
) -> dict[HLACombinedStandard, int]:
363+
"""
364+
Find the combinations of standards that match the given sequence.
365+
366+
This uses combine_standards_stepper to find any putative matches, and
367+
then uses collate_matching_allele_pairs to compile the information into
368+
a dictionary mapping HLACombinedStandards to their mismatch counts.
369+
370+
The parameters are as for combine_standards_stepper; mismatch_threshold
371+
is also fed directly into collate_matching_allele_pairs and affects the
372+
results accordingly.
373+
"""
374+
return HLAAlgorithm.collate_matching_allele_pairs(
375+
HLAAlgorithm.combine_standards_stepper(
376+
matching_stds,
377+
seq,
378+
mismatch_threshold if mismatch_threshold is not None else 0,
379+
),
380+
mismatch_threshold,
381+
)
382+
365383
@staticmethod
366384
def get_mismatches(
367385
standard_bin: Sequence[int],

src/hla_algorithm/models.py

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,10 @@
22
from collections.abc import Iterable
33
from dataclasses import dataclass, field
44
from operator import itemgetter
5-
from typing import Final, Optional
5+
from typing import Final, Optional, Self
66

77
import numpy as np
8-
from pydantic import BaseModel, ConfigDict
8+
from pydantic import BaseModel, ConfigDict, model_validator
99

1010
from .utils import (
1111
HLA_LOCUS,
@@ -75,6 +75,45 @@ class HLAStandardMatch(HLAStandard):
7575
mismatch: int
7676

7777

78+
class MatchingAllelePair(BaseModel):
79+
"""
80+
Represents an allele pair that matches an observed sequence.
81+
82+
This contains:
83+
- the combined standard, as a tuple of integers 0-15;
84+
- the number of mismatches identified; and
85+
- the allele pair (i.e. names of the two alleles in the combination).
86+
"""
87+
standard_bin: tuple[int, ...]
88+
mismatch_count: int
89+
allele_1: str
90+
allele_2: str
91+
92+
@model_validator(mode="after")
93+
def check_alleles_ordered(self) -> Self:
94+
if allele_coordinates_sort_key(self.allele_1) > allele_coordinates_sort_key(self.allele_2):
95+
raise ValueError("allele_1 should be less than or equal to allele_2")
96+
return self
97+
98+
@classmethod
99+
def create_from_unsorted_alleles(
100+
cls,
101+
standard_bin: tuple[int, ...],
102+
mismatch_count: int,
103+
allele_names: tuple[str, str],
104+
) -> Self:
105+
sorted_allele_names: list[str] = sorted(
106+
allele_names,
107+
key=allele_coordinates_sort_key,
108+
)
109+
return cls(
110+
standard_bin=standard_bin,
111+
mismatch_count=mismatch_count,
112+
allele_1=sorted_allele_names[0],
113+
allele_2=sorted_allele_names[1],
114+
)
115+
116+
78117
class HLACombinedStandard(BaseModel):
79118
"""
80119
Represents a combined HLA standard and all of its possible combinations.

0 commit comments

Comments
 (0)