Source code for crispio.map

"""Classes for representing guide RNA libraries."""

from collections.abc import Iterable, Iterator

from dataclasses import asdict, dataclass, field
import math

from bioino import FastaSequence, GffFile, GffLine
from carabiner import pprint_dict
from nemony import encode, hash as nm_hash
from streq import find_iupac, reverse_complement, which_re_sites
from tqdm.auto import tqdm

from .annotate import annotate_from_gff
from .features import get_context

[docs] @dataclass class GuideMatch: """Information of guide matching a genome. Attributes ========== pam_search : str IUPAC search string for PAM. guide_seq : str Guide spacer sequence. pam_seq : str Actual PAM sequence. pam_start : int Chromosome coordinate of PAM start. pam_end : int Chromosome coordinate of PAM end. length : int Length of guide. Examples ======== >>> GuideMatch(pam_search="NGG", guide_seq="ATCGATCG", pam_seq="CGG", pam_start=10, reverse=False) GuideMatch(pam_search='NGG', guide_seq='ATCGATCG', pam_seq='CGG', pam_start=10, reverse=False, guide_context_up=None, guide_context_down=None, pam_end=13, length=8, guide_start=2, guide_end=10) >>> GuideMatch(pam_search="NGG", guide_seq="ATCGATCG", pam_seq="CCG", pam_start=10, reverse=True) GuideMatch(pam_search='NGG', guide_seq='CGATCGAT', pam_seq='CGG', pam_start=10, reverse=True, guide_context_up=None, guide_context_down=None, pam_end=13, length=8, guide_start=13, guide_end=21) """ pam_search: str guide_seq: str pam_seq: str pam_start: int reverse: bool guide_context_up: str | None = field(init=False, default=None) guide_context_down: str | None = field(init=False, default=None) pam_end: int = field(init=False) length: int = field(init=False) guide_start: int = field(init=False) guide_end: int = field(init=False) def __post_init__(self): self.length = len(self.guide_seq) self.pam_end = self.pam_start + len(self.pam_seq) if not self.reverse: self.guide_start = self.pam_start - self.length self.guide_end = self.pam_start else: self.guide_start = self.pam_end self.guide_end = self.pam_end + self.length self.pam_seq = reverse_complement(self.pam_seq) self.guide_seq = reverse_complement(self.guide_seq) def __len__(self): return len(self.guide_seq) def __str__(self): return self.guide_seq
[docs] def as_dict(self): return asdict(self)
[docs] @dataclass class GuideMatchCollection: """Set of guides with the same sequence but potentially with multiple matches. Attributes ========== guide_seq : str Guide spacer sequence. pam_search : str IUPAC search string for PAM. matches : iterable of GuideMatch Objects with matching information. guide_name : str, optional Name or identifier of guide. """ guide_seq: str pam_search: str matches: Iterable[GuideMatch] guide_name: str | None = field(default=None) def __iter__(self): try: for match in self.matches: yield match except ValueError as e: raise e def __len__(self): """Give the number of matches. If the matches are not yet instantiated, this will instantiate the matches. Depending on the size of the search, it can take memory and time. """ try: return len(self.matches) except TypeError: self.matches = list(self.matches) return len(self.matches) @staticmethod def _from_search( guide_seq: str, genome: str, pam_search: str = "NGG" ) -> Iterator[GuideMatch]: from streq import Circular pam_len = len(pam_search) guide_len = len(guide_seq) genome_len = len(genome) for reverse in (False, True): if reverse: this_guide = reverse_complement(guide_seq) this_pam = reverse_complement(pam_search) else: this_guide = guide_seq this_pam = pam_search pos = genome.find(this_guide) if pos == -1: continue while pos > -1 and pos < genome_len: if reverse: pam_start = pos - pam_len if not isinstance(genome, Circular) and pam_start < 0: pos = genome.find(this_guide, pos + 1) continue pam_candidate = genome[pam_start : pos] else: pam_start = pos + guide_len pam_candidate = genome[pam_start : pam_start + pam_len] for i, (_, pam_seq) in enumerate(find_iupac(this_pam, pam_candidate)): gm = GuideMatch( pam_search=pam_search, pam_seq=pam_seq, guide_seq=guide_seq, pam_start=pam_start, reverse=reverse, ) guide_down, guide_up = get_context( gm.pam_start, gm.pam_end, gm.guide_start, gm.guide_end, genome=genome, reverse=reverse, ) gm.guide_context_down = guide_down gm.guide_context_up = guide_up yield gm pos = genome.find(this_guide, pos + 1)
[docs] @dataclass class GuideLibrary: """Library of guides from a genome. Attributes ========== genome : str Genome sequence that guides are matched to. guide_matches : list of GuideMatchCollection List of matches to the genome. """ genome: str guide_matches: Iterable[GuideMatchCollection] def __iter__(self): try: for match in self.guide_matches: yield match except ValueError as e: raise e def __len__(self): """Give the number of matches. If the matches are not yet instantiated, this will instantiate the matches. Depending on the size of the search, it can take memory and time. """ try: return len(self.guide_matches) except TypeError: self.guide_matches = list(self.guide_matches) return len(self.guide_matches)
[docs] def as_gff( self, max_per_collection: int | None = None, annotations_from: GffFile | None = None, tags: Iterable[str] | None = None, gff_defaults: dict[str, str | int] | None = None ) -> Iterator[GffLine]: """Convert into a iterable of `bioino.GffLine`s. Parameters ========== max : int, optional Number of `bioino.GffLine`s to return for each `GuideMatchCollection`. Default: return all. annotations_from : bioino.GffFile, optional If provided use the `lookup` table to annotate the returned `GffLine`s. tags : list of str, optional Which tags to take from `annotations_from`. gff_defaults : dict In case of missing values that are essential for GFF file formats (namely columns 1-8), take values from this disctionary. Yields ====== bioino.GffLine Corresponding to a `GuideMatch`. Examples ======== >>> genome = "ATATATATATATATATATATATATACCGTTTTTTTAAAAAAACGGATATATATATATAATATATATATATAATATATATATATA" >>> lib = GuideLibrary.from_generating(genome=genome) >>> for gff in lib.as_gff(gff_defaults=dict(seqid="my_seq", source="here", feature="protospacer")): # doctest: +NORMALIZE_WHITESPACE ... print(gff) ... my_seq here protospacer 23 42 . + . ID=sgr-06a4ba9b;Name=42-united_exodus;guide_context_down=ATATATATATATAATATATA;guide_context_up=ATATATATATATATATATAT;guide_length=20;guide_re_sites=;guide_sequence=ATACCGTTTTTTTAAAAAAA;guide_sequence_hash=a3987295;mnemonic=united_exodus;pam_end=45;pam_replichore=L;pam_search=NGG;pam_sequence=CGG;pam_start=42;source_name=42-united_exodus my_seq here protospacer 29 48 . - . ID=sgr-f84d1c6a;Name=25-zigzag_state;guide_context_down=TATATATATATATATATATA;guide_context_up=ATATATATATTATATATATA;guide_length=20;guide_re_sites=;guide_sequence=TATCCGTTTTTTTAAAAAAA;guide_sequence_hash=188c9ee6;mnemonic=zigzag_state;pam_end=28;pam_replichore=R;pam_search=NGG;pam_sequence=CGG;pam_start=25;source_name=25-zigzag_state The ``seqid`` supplied in ``gff_defaults`` propagates to every output ``GffLine``. This is the mechanism used by the multi-chromosome CLI path to tag each guide with the chromosome it was found on: >>> genome = ('ATATATATATATATATATATATATACCGTTTTTTTAAAAAAACGG' ... 'ATATATATATATAATATATATATATAATATATATATATA') >>> lib = GuideLibrary.from_generating(genome=genome, in_memory=True) >>> defaults = dict(seqid='NC_000913.3', source='crispio', ... feature='protospacer', score='.', phase='.') >>> seqids = {line.columns.seqid for line in lib.as_gff(gff_defaults=defaults)} >>> seqids {'NC_000913.3'} """ genome_length = len(self.genome) max_per_collection = max_per_collection if max_per_collection is not None else math.inf gff_defaults = gff_defaults or {} for guide_match_collection in self: for i, match in enumerate(guide_match_collection.matches): if i >= max_per_collection: break sgrna_info = { "ID": "sgr-" + nm_hash((match.guide_seq, match.pam_search, match.pam_start), 8), "mnemonic": encode((match.guide_seq, match.pam_search, match.pam_start)), "guide_sequence_hash": nm_hash(match.guide_seq, 8), "source_name": guide_match_collection.guide_name, "pam_start": match.pam_start, "pam_end": match.pam_end, "pam_search": match.pam_search, "pam_sequence": match.pam_seq, "pam_replichore": "R" if ((match.pam_start / genome_length) < .5) else "L", "strand": ("+" if not match.reverse else "-"), "start": ((match.guide_start + 1) if (match.guide_start + 1) > 0 else match.guide_start + 1 + genome_length), "end": (match.guide_end if (match.guide_start + 1) > 0 else match.guide_end + genome_length), "guide_context_up": match.guide_context_up, "guide_context_down": match.guide_context_down, "guide_length": match.length, "guide_re_sites": ",".join(which_re_sites(match.guide_seq)), "guide_sequence": match.guide_seq } sgrna_info["Name"] = "{pam_start}-{mnemonic}".format(**sgrna_info) if annotations_from is not None: seqid = gff_defaults.get("seqid", "") sgrna_info = annotate_from_gff( sgrna_info, gff_data=annotations_from, seqid=seqid, tags=tags, ) sgrna_info["Name"] = "{ann_Name}-{pam_start}-{mnemonic}".format(**sgrna_info) sgrna_info.update(gff_defaults) sgrna_info["source_name"] = sgrna_info["source_name"] or sgrna_info["Name"] yield GffLine.from_dict(sgrna_info)
@staticmethod def _from_mapping( guide_seq: Iterable[FastaSequence], genome: str, pam_search: str = "NGG", limit: int | None = None ): not_found = {} limit = limit or 0 no_limit = limit is None or limit == 0 with tqdm(guide_seq) as t: ## run a progress bar for i, guide_sequence in enumerate(t): t.set_postfix( current=guide_sequence.name[:40], not_found=len(not_found), ) try: guide_matches = GuideMatchCollection.from_search( guide_seq=guide_sequence.sequence, guide_name=guide_sequence.name, pam_search=pam_search, genome=genome, ) except ValueError: not_found[guide_sequence.name] = guide_sequence.sequence else: if no_limit or i <= limit: yield guide_matches else: break pprint_dict( not_found, message=f"Not found: {len(not_found)} guides", ) return not_found
[docs] @classmethod def from_mapping( cls, guide_seq: str | Iterable[str] | FastaSequence | Iterable[FastaSequence], genome: str, pam_search: str = "NGG", in_memory: bool = False, limit: int | None = None ): """Map a set of expected guides to a genome. The default behavior is to find matches lazily to save memory and time. Parameters ========== guide_seq : str or bioino.FastaSequence or list Guides to map. genome : str Genome to map against. pam_search : str IUPAC PAM sequence to search against. in_memory : bool, optional Whether to instantiate matches in memory. Default: lazy matching. Returns ======= GuideLibrary Examples ======== >>> genome = "CCCCCCCCCCCTTTTTTTTTTAAAAAAAAAATGATCGATCGATCGAGGAAAAAAAAAACCCCCCCCCCC" >>> guide_seq = ["ATGATCGATCGATCG", "ATGATCGATCGATCGCCC"] >>> gl = GuideLibrary.from_mapping(guide_seq=guide_seq, genome=genome) >>> for collection in gl: ... for match in collection: ... print(match.as_dict()) ... {'pam_search': 'NGG', 'guide_seq': 'ATGATCGATCGATCG', 'pam_seq': 'AGG', 'pam_start': 45, 'reverse': False, 'guide_context_up': 'CTTTTTTTTTTAAAAAAAAA', 'guide_context_down': 'AAAAAAAAAACCCCCCCCCC', 'pam_end': 48, 'length': 15, 'guide_start': 30, 'guide_end': 45} """ if isinstance(guide_seq, str): guide_seq = [guide_seq] if isinstance(guide_seq, Iterable): new_guide_seq = [] for g in guide_seq: if isinstance(g, str): g = FastaSequence( name=g, description='query_spacer', sequence=g, ) new_guide_seq.append(g) guide_seq = new_guide_seq matches = cls._from_mapping( guide_seq=guide_seq, genome=genome, pam_search=pam_search, limit=limit, ) if in_memory: matches = list(matches) return cls( genome=genome, guide_matches=matches, )
@staticmethod def _from_generating( genome: str, max_length: int = 20, min_length: int | None = None, pam_search: str = "N", limit: int | None = None ) -> Iterator[GuideMatchCollection]: """ Examples ======== After the bug fix in ``_from_generating`` (always store canonical ``pam_search`` in ``GuideMatchCollection``), the collection and its contained ``GuideMatch`` must agree on ``pam_search`` regardless of strand: >>> genome = ('ATATATATATATATATATATATATACCGTTTTTTTAAAAAAACGG' ... 'ATATATATATATAATATATATATATAATATATATATATA') >>> gl = GuideLibrary.from_generating(genome=genome, in_memory=True) >>> for coll in gl.guide_matches: ... match = list(coll.matches)[0] ... assert coll.pam_search == match.pam_search, ( ... f"Inconsistent pam_search on reverse={match.reverse}: " ... f"collection={coll.pam_search!r} vs match={match.pam_search!r}" ... ) All pam_search values should be the canonical (forward-strand) PAM: >>> pam_searches = {coll.pam_search for coll in gl.guide_matches} >>> pam_searches {'NGG'} """ min_length = min_length or max_length found, guides_created = 0, 0 limit = limit or 0 no_limit = limit is None or limit == 0 for reverse in (False, True): directionality = "reverse" if reverse else "forward" _pam_search = reverse_complement(pam_search) if reverse else pam_search broken = False with tqdm(find_iupac(_pam_search, genome, overlapped=True)) as t: ## run a progress bar for i, ((pam_start, pam_end), pam_seq) in enumerate(t): found += 1 if no_limit or found <= limit: pass else: broken = True break for length in range(min_length, max_length + 1): guides_created += 1 guide_start = (pam_start - length if not reverse else pam_end) guide_end = (pam_start if not reverse else pam_end + length) guide_seq = genome[guide_start:guide_end] t.set_postfix(direction=directionality, at_site=pam_start, pam_sites_found=found, guides_created=guides_created) gm = GuideMatch( pam_search=pam_search, pam_seq=pam_seq, guide_seq=guide_seq, pam_start=pam_start, reverse=reverse, ) guide_down, guide_up = get_context( gm.pam_start, gm.pam_end, gm.guide_start, gm.guide_end, genome=genome, reverse=reverse, ) gm.guide_context_down = guide_down gm.guide_context_up = guide_up # TODO: Actually group by sequence yield GuideMatchCollection( guide_seq=guide_seq, pam_search=pam_search, ## store canonical PAM, not RC matches=[gm], ) if broken: break return guides_created
[docs] @classmethod def from_generating( cls, genome: str, max_length: int = 20, min_length: int | None = None, pam_search: str = "NGG", in_memory: bool = False, limit: int | None = None ): """Find all guides matching a PAM sequence in a given genome. The default behavior is to find matches lazily to save memory and time. Parameters ======-- genome : str Genome sequence to search. max_length : int, optional Maximum guide length. Default: 20. min_length : int, optional Minimum guide length. Default: same as max_length. pam_search : str, optional IUPAC PAM sequence to search for. Default: "NGG". in_memory : bool, optional Whether to instantiate matches in memory. Default: lazy matching. Examples ====== >>> genome = "ATATATATATATATATATATATATACCGTTTTTTTAAAAAAACGGATATATATATATAATATATATATATAATATATATATATA" >>> gl = GuideLibrary.from_generating(genome=genome) >>> len(gl) 2 >>> for match_collection in gl: ... for guide in match_collection: ... print(guide) ... ATACCGTTTTTTTAAAAAAA TATCCGTTTTTTTAAAAAAA """ matches = ( match for match in cls._from_generating( genome, max_length=max_length, min_length=min_length, pam_search=pam_search, limit=limit, ) ) if in_memory: matches = list(matches) return cls( genome=genome, guide_matches=matches, )