44from io import TextIOBase
55from operator import attrgetter
66from pathlib import Path
7- from typing import Final , Optional , TypedDict , cast
7+ from typing import Final , Optional , TypedDict
88
99import numpy as np
1010import yaml
1818 HLASequence ,
1919 HLAStandard ,
2020 HLAStandardMatch ,
21+ MatchingAllelePair ,
2122)
2223from .utils import (
2324 BIN2NUC ,
2425 HLA_LOCUS ,
2526 StoredHLAStandards ,
26- allele_coordinates_sort_key ,
2727 count_strict_mismatches ,
2828 nuc2bin ,
2929 sort_allele_pairs ,
@@ -138,7 +138,9 @@ def load_default_hla_standards() -> LoadedStandards:
138138 :return: List of known HLA standards
139139 :rtype: list[HLAStandard]
140140 """
141- with open (HLAAlgorithm .DEFAULT_CONFIG_DIR / "hla_standards.yaml" ) as standards_file :
141+ with open (
142+ HLAAlgorithm .DEFAULT_CONFIG_DIR / "hla_standards.yaml"
143+ ) as standards_file :
142144 return HLAAlgorithm .read_hla_standards (standards_file )
143145
144146 FREQUENCY_LOCUS_COLUMNS : dict [HLA_LOCUS , tuple [str , str ]] = {
@@ -230,16 +232,24 @@ def combine_standards_stepper(
230232 matching_stds : Sequence [HLAStandardMatch ],
231233 seq : Sequence [int ],
232234 mismatch_threshold : int = 0 ,
233- ) -> Generator [tuple [ tuple [ int , ...], int , tuple [ str , str ]] , None , None ]:
235+ ) -> Generator [MatchingAllelePair , None , None ]:
234236 """
235237 Identifies "good" combined standards for the specified sequence.
236238
239+ Humans have two copies of their HLA genes, so when we use Sanger
240+ sequencing to sequence a person's HLA, we get a single sequence with
241+ potentially many mixtures. That is, at any position that the two genes
242+ don't match, we see a nucleotide mixture consisting of the two
243+ corresponding bases.
244+
245+ In order to find matches, we take allele sequences (reduced to ones that
246+ are already "decent" matches for our sequence, to reduce running time)
247+ and "mush" them together to produce potential matches for our sequence.
248+
237249 On each iteration, it continues checking combined standards until it
238- finds a "match", and yields a tuple containing the details of that
239- match:
240- - the combined standard, as a tuple of integers 0-15;
241- - the number of mismatches identified; and
242- - the allele pair (i.e. names of the two alleles in the combination).
250+ finds a "match", and yields a MatchingAllelePair containing its details.
251+
252+ PRECONDITION: matching_stds should contain no duplicates.
243253
244254 A "match" is defined by the number of mismatches between the combined
245255 standard and the sequence:
@@ -263,15 +273,6 @@ def combine_standards_stepper(
263273 # "Mush" the two standards together to produce something
264274 # that looks like what you get when you sequence HLA.
265275 std_bin = np .array (std_b .sequence ) | np .array (std_a .sequence )
266- allele_pair : tuple [str , str ] = cast (
267- tuple [str , str ],
268- tuple (
269- sorted (
270- (std_a .allele , std_b .allele ),
271- key = allele_coordinates_sort_key ,
272- )
273- ),
274- )
275276
276277 # There could be more than one combined standard with the
277278 # same sequence, so check if this one's already been found.
@@ -291,28 +292,19 @@ def combine_standards_stepper(
291292 elif mismatches < current_rejection_threshold :
292293 current_rejection_threshold = max (mismatches , mismatch_threshold )
293294
294- yield (combined_std_bin , mismatches , allele_pair )
295+ yield MatchingAllelePair .create_from_unsorted_alleles (
296+ standard_bin = combined_std_bin ,
297+ mismatch_count = mismatches ,
298+ allele_names = (std_a .allele , std_b .allele ),
299+ )
295300
296301 @staticmethod
297- def combine_standards (
298- matching_stds : Sequence [HLAStandardMatch ],
299- seq : Sequence [int ],
302+ def collate_matching_allele_pairs (
303+ matching_allele_pairs : Iterable [MatchingAllelePair ],
300304 mismatch_threshold : Optional [int ] = None ,
301305 ) -> dict [HLACombinedStandard , int ]:
302306 """
303- Find the combinations of standards that match the given sequence.
304-
305- Humans have two copies of their HLA genes, so when we use Sanger
306- sequencing to sequence a person's HLA, we get a single sequence with
307- potentially many mixtures. That is, at any position that the two genes
308- don't match, we see a nucleotide mixture consisting of the two
309- corresponding bases.
310-
311- In order to find matches, we take allele sequences (reduced to ones that
312- are already "decent" matches for our sequence, to reduce running time)
313- and "mush" them together to produce potential matches for our sequence.
314-
315- PRECONDITION: matching_stds should contain no duplicates.
307+ Collate the given MatchingAllelePairs into HLACombinedStandards.
316308
317309 Returns a dictionary mapping HLACombinedStandards to their mismatch
318310 counts. If mismatch_threshold is None or 0, then the result contains
@@ -330,13 +322,13 @@ def combine_standards(
330322 combos : dict [tuple [int , ...], tuple [int , list [tuple [str , str ]]]] = {}
331323
332324 fewest_mismatches : int | float = float ("inf" )
333- for (
334- combined_std_bin ,
335- mismatches ,
336- allele_pair ,
337- ) in HLAAlgorithm . combine_standards_stepper (
338- matching_stds , seq , mismatch_threshold
339- ):
325+ for matching_allele_pair in matching_allele_pairs :
326+ combined_std_bin : tuple [ int , ...] = matching_allele_pair . standard_bin
327+ mismatches : int = matching_allele_pair . mismatch_count
328+ allele_pair : tuple [ str , str ] = (
329+ matching_allele_pair . allele_1 ,
330+ matching_allele_pair . allele_2 ,
331+ )
340332 if combined_std_bin not in combos :
341333 combos [combined_std_bin ] = (mismatches , [])
342334 combos [combined_std_bin ][1 ].append (allele_pair )
@@ -362,6 +354,32 @@ def combine_standards(
362354
363355 return result
364356
@staticmethod
def combine_standards(
    matching_stds: Sequence[HLAStandardMatch],
    seq: Sequence[int],
    mismatch_threshold: Optional[int] = None,
) -> dict[HLACombinedStandard, int]:
    """
    Find the combinations of standards that match the given sequence.

    This is a thin pipeline over two other HLAAlgorithm helpers:

    1. combine_standards_stepper lazily produces the putative matches
       for seq out of matching_stds; then
    2. collate_matching_allele_pairs gathers those matches into a
       dictionary mapping HLACombinedStandards to their mismatch counts.

    matching_stds and seq are handed to combine_standards_stepper as-is.
    mismatch_threshold is used by both steps: the stepper receives it with
    None replaced by 0, while collate_matching_allele_pairs receives the
    original value (including None) and filters its results accordingly.
    """
    # The stepper is given a concrete integer threshold, so None becomes 0
    # for that call only; the collation step still sees the caller's value.
    stepper_threshold: int = (
        0 if mismatch_threshold is None else mismatch_threshold
    )
    candidate_pairs = HLAAlgorithm.combine_standards_stepper(
        matching_stds,
        seq,
        stepper_threshold,
    )
    return HLAAlgorithm.collate_matching_allele_pairs(
        candidate_pairs,
        mismatch_threshold,
    )
382+
365383 @staticmethod
366384 def get_mismatches (
367385 standard_bin : Sequence [int ],
0 commit comments