"""Classes for representing guide RNA libraries."""
from collections.abc import Iterable, Iterator
from dataclasses import asdict, dataclass, field
import math
from bioino import FastaSequence, GffFile, GffLine
from carabiner import pprint_dict
from nemony import encode, hash as nm_hash
from streq import find_iupac, reverse_complement, which_re_sites
from tqdm.auto import tqdm
from .annotate import annotate_from_gff
from .features import get_context
[docs]
@dataclass
class GuideMatch:
"""Information of guide matching a genome.
Attributes
==========
pam_search : str
IUPAC search string for PAM.
guide_seq : str
Guide spacer sequence.
pam_seq : str
Actual PAM sequence.
pam_start : int
Chromosome coordinate of PAM start.
pam_end : int
Chromosome coordinate of PAM end.
length : int
Length of guide.
Examples
========
>>> GuideMatch(pam_search="NGG", guide_seq="ATCGATCG", pam_seq="CGG", pam_start=10, reverse=False)
GuideMatch(pam_search='NGG', guide_seq='ATCGATCG', pam_seq='CGG', pam_start=10, reverse=False, guide_context_up=None, guide_context_down=None, pam_end=13, length=8, guide_start=2, guide_end=10)
>>> GuideMatch(pam_search="NGG", guide_seq="ATCGATCG", pam_seq="CCG", pam_start=10, reverse=True)
GuideMatch(pam_search='NGG', guide_seq='CGATCGAT', pam_seq='CGG', pam_start=10, reverse=True, guide_context_up=None, guide_context_down=None, pam_end=13, length=8, guide_start=13, guide_end=21)
"""
pam_search: str
guide_seq: str
pam_seq: str
pam_start: int
reverse: bool
guide_context_up: str | None = field(init=False, default=None)
guide_context_down: str | None = field(init=False, default=None)
pam_end: int = field(init=False)
length: int = field(init=False)
guide_start: int = field(init=False)
guide_end: int = field(init=False)
def __post_init__(self):
self.length = len(self.guide_seq)
self.pam_end = self.pam_start + len(self.pam_seq)
if not self.reverse:
self.guide_start = self.pam_start - self.length
self.guide_end = self.pam_start
else:
self.guide_start = self.pam_end
self.guide_end = self.pam_end + self.length
self.pam_seq = reverse_complement(self.pam_seq)
self.guide_seq = reverse_complement(self.guide_seq)
def __len__(self):
return len(self.guide_seq)
def __str__(self):
return self.guide_seq
[docs]
def as_dict(self):
return asdict(self)
[docs]
@dataclass
class GuideMatchCollection:
"""Set of guides with the same sequence but potentially with multiple
matches.
Attributes
==========
guide_seq : str
Guide spacer sequence.
pam_search : str
IUPAC search string for PAM.
matches : iterable of GuideMatch
Objects with matching information.
guide_name : str, optional
Name or identifier of guide.
"""
guide_seq: str
pam_search: str
matches: Iterable[GuideMatch]
guide_name: str | None = field(default=None)
def __iter__(self):
try:
for match in self.matches:
yield match
except ValueError as e:
raise e
def __len__(self):
"""Give the number of matches.
If the matches are not yet instantiated, this will instantiate the matches.
Depending on the size of the search, it can take memory and time.
"""
try:
return len(self.matches)
except TypeError:
self.matches = list(self.matches)
return len(self.matches)
@staticmethod
def _from_search(
guide_seq: str,
genome: str,
pam_search: str = "NGG"
) -> Iterator[GuideMatch]:
from streq import Circular
pam_len = len(pam_search)
guide_len = len(guide_seq)
genome_len = len(genome)
for reverse in (False, True):
if reverse:
this_guide = reverse_complement(guide_seq)
this_pam = reverse_complement(pam_search)
else:
this_guide = guide_seq
this_pam = pam_search
pos = genome.find(this_guide)
if pos == -1:
continue
while pos > -1 and pos < genome_len:
if reverse:
pam_start = pos - pam_len
if not isinstance(genome, Circular) and pam_start < 0:
pos = genome.find(this_guide, pos + 1)
continue
pam_candidate = genome[pam_start : pos]
else:
pam_start = pos + guide_len
pam_candidate = genome[pam_start : pam_start + pam_len]
for i, (_, pam_seq) in enumerate(find_iupac(this_pam, pam_candidate)):
gm = GuideMatch(
pam_search=pam_search,
pam_seq=pam_seq,
guide_seq=guide_seq,
pam_start=pam_start,
reverse=reverse,
)
guide_down, guide_up = get_context(
gm.pam_start,
gm.pam_end,
gm.guide_start,
gm.guide_end,
genome=genome,
reverse=reverse,
)
gm.guide_context_down = guide_down
gm.guide_context_up = guide_up
yield gm
pos = genome.find(this_guide, pos + 1)
[docs]
@classmethod
def from_search(
cls,
guide_seq: str,
genome: str,
pam_search: str = "NGG",
guide_name: str | None = None,
in_memory: bool = False
):
"""Find the location of a guide sequence in a genome.
Searches the genome in the forward strand then the reverse strand,
returning the match with an adjacent PAM in the order found.
The default behavior is to find matches lazily to save memory and time.
Parameters
==========
guide_seq : str
The sequence of the guide to be found.
pam_search : str, optional
The sequence (IUPAC codes allowed) of the PAM to match. Default: "NGG".
genome : str
The genome sequence to search.
guide_name : str
Name or identifier of guide.
Raises
======
ValueError
If guide not found in genome with appropriate PAM.
Returns
=======
GuideMatches
A iterator of dictionaries of match information.
Examples
========
>>> gmc = GuideMatchCollection.from_search("TTTTTTTAAAAAAA", "CCGTTTTTTTAAAAAAACGG")
>>> len(gmc)
2
>>> for match in gmc:
... print(match)
...
TTTTTTTAAAAAAA
TTTTTTTAAAAAAA
"""
if guide_seq not in genome and reverse_complement(guide_seq) not in genome:
raise ValueError(f"{guide_seq} not in genome")
matches = cls._from_search(guide_seq, genome, pam_search)
if in_memory:
matches = list(matches)
return cls(
pam_search=pam_search,
guide_seq=guide_seq,
guide_name=guide_name,
matches=matches,
)
[docs]
@dataclass
class GuideLibrary:
"""Library of guides from a genome.
Attributes
==========
genome : str
Genome sequence that guides are matched to.
guide_matches : list of GuideMatchCollection
List of matches to the genome.
"""
genome: str
guide_matches: Iterable[GuideMatchCollection]
def __iter__(self):
try:
for match in self.guide_matches:
yield match
except ValueError as e:
raise e
def __len__(self):
"""Give the number of matches.
If the matches are not yet instantiated, this will instantiate the matches.
Depending on the size of the search, it can take memory and time.
"""
try:
return len(self.guide_matches)
except TypeError:
self.guide_matches = list(self.guide_matches)
return len(self.guide_matches)
[docs]
def as_gff(
self,
max_per_collection: int | None = None,
annotations_from: GffFile | None = None,
tags: Iterable[str] | None = None,
gff_defaults: dict[str, str | int] | None = None
) -> Iterator[GffLine]:
"""Convert into a iterable of `bioino.GffLine`s.
Parameters
==========
max : int, optional
Number of `bioino.GffLine`s to return for each `GuideMatchCollection`. Default: return all.
annotations_from : bioino.GffFile, optional
If provided use the `lookup` table to annotate the returned `GffLine`s.
tags : list of str, optional
Which tags to take from `annotations_from`.
gff_defaults : dict
In case of missing values that are essential for GFF file formats
(namely columns 1-8), take values from this disctionary.
Yields
======
bioino.GffLine
Corresponding to a `GuideMatch`.
Examples
========
>>> genome = "ATATATATATATATATATATATATACCGTTTTTTTAAAAAAACGGATATATATATATAATATATATATATAATATATATATATA"
>>> lib = GuideLibrary.from_generating(genome=genome)
>>> for gff in lib.as_gff(gff_defaults=dict(seqid="my_seq", source="here", feature="protospacer")): # doctest: +NORMALIZE_WHITESPACE
... print(gff)
...
my_seq here protospacer 23 42 . + . ID=sgr-06a4ba9b;Name=42-united_exodus;guide_context_down=ATATATATATATAATATATA;guide_context_up=ATATATATATATATATATAT;guide_length=20;guide_re_sites=;guide_sequence=ATACCGTTTTTTTAAAAAAA;guide_sequence_hash=a3987295;mnemonic=united_exodus;pam_end=45;pam_replichore=L;pam_search=NGG;pam_sequence=CGG;pam_start=42;source_name=42-united_exodus
my_seq here protospacer 29 48 . - . ID=sgr-f84d1c6a;Name=25-zigzag_state;guide_context_down=TATATATATATATATATATA;guide_context_up=ATATATATATTATATATATA;guide_length=20;guide_re_sites=;guide_sequence=TATCCGTTTTTTTAAAAAAA;guide_sequence_hash=188c9ee6;mnemonic=zigzag_state;pam_end=28;pam_replichore=R;pam_search=NGG;pam_sequence=CGG;pam_start=25;source_name=25-zigzag_state
The ``seqid`` supplied in ``gff_defaults`` propagates to every output
``GffLine``. This is the mechanism used by the multi-chromosome CLI path
to tag each guide with the chromosome it was found on:
>>> genome = ('ATATATATATATATATATATATATACCGTTTTTTTAAAAAAACGG'
... 'ATATATATATATAATATATATATATAATATATATATATA')
>>> lib = GuideLibrary.from_generating(genome=genome, in_memory=True)
>>> defaults = dict(seqid='NC_000913.3', source='crispio',
... feature='protospacer', score='.', phase='.')
>>> seqids = {line.columns.seqid for line in lib.as_gff(gff_defaults=defaults)}
>>> seqids
{'NC_000913.3'}
"""
genome_length = len(self.genome)
max_per_collection = max_per_collection if max_per_collection is not None else math.inf
gff_defaults = gff_defaults or {}
for guide_match_collection in self:
for i, match in enumerate(guide_match_collection.matches):
if i >= max_per_collection:
break
sgrna_info = {
"ID": "sgr-" + nm_hash((match.guide_seq, match.pam_search, match.pam_start), 8),
"mnemonic": encode((match.guide_seq, match.pam_search, match.pam_start)),
"guide_sequence_hash": nm_hash(match.guide_seq, 8),
"source_name": guide_match_collection.guide_name,
"pam_start": match.pam_start,
"pam_end": match.pam_end,
"pam_search": match.pam_search,
"pam_sequence": match.pam_seq,
"pam_replichore": "R" if ((match.pam_start / genome_length) < .5) else "L",
"strand": ("+" if not match.reverse else "-"),
"start": ((match.guide_start + 1) if (match.guide_start + 1) > 0
else match.guide_start + 1 + genome_length),
"end": (match.guide_end if (match.guide_start + 1) > 0
else match.guide_end + genome_length),
"guide_context_up": match.guide_context_up,
"guide_context_down": match.guide_context_down,
"guide_length": match.length,
"guide_re_sites": ",".join(which_re_sites(match.guide_seq)),
"guide_sequence": match.guide_seq
}
sgrna_info["Name"] = "{pam_start}-{mnemonic}".format(**sgrna_info)
if annotations_from is not None:
seqid = gff_defaults.get("seqid", "")
sgrna_info = annotate_from_gff(
sgrna_info,
gff_data=annotations_from,
seqid=seqid,
tags=tags,
)
sgrna_info["Name"] = "{ann_Name}-{pam_start}-{mnemonic}".format(**sgrna_info)
sgrna_info.update(gff_defaults)
sgrna_info["source_name"] = sgrna_info["source_name"] or sgrna_info["Name"]
yield GffLine.from_dict(sgrna_info)
@staticmethod
def _from_mapping(
guide_seq: Iterable[FastaSequence],
genome: str,
pam_search: str = "NGG",
limit: int | None = None
):
not_found = {}
limit = limit or 0
no_limit = limit is None or limit == 0
with tqdm(guide_seq) as t: ## run a progress bar
for i, guide_sequence in enumerate(t):
t.set_postfix(
current=guide_sequence.name[:40],
not_found=len(not_found),
)
try:
guide_matches = GuideMatchCollection.from_search(
guide_seq=guide_sequence.sequence,
guide_name=guide_sequence.name,
pam_search=pam_search,
genome=genome,
)
except ValueError:
not_found[guide_sequence.name] = guide_sequence.sequence
else:
if no_limit or i <= limit:
yield guide_matches
else:
break
pprint_dict(
not_found,
message=f"Not found: {len(not_found)} guides",
)
return not_found
[docs]
@classmethod
def from_mapping(
cls,
guide_seq: str | Iterable[str] | FastaSequence | Iterable[FastaSequence],
genome: str,
pam_search: str = "NGG",
in_memory: bool = False,
limit: int | None = None
):
"""Map a set of expected guides to a genome.
The default behavior is to find matches lazily to save memory and time.
Parameters
==========
guide_seq : str or bioino.FastaSequence or list
Guides to map.
genome : str
Genome to map against.
pam_search : str
IUPAC PAM sequence to search against.
in_memory : bool, optional
Whether to instantiate matches in memory. Default: lazy matching.
Returns
=======
GuideLibrary
Examples
========
>>> genome = "CCCCCCCCCCCTTTTTTTTTTAAAAAAAAAATGATCGATCGATCGAGGAAAAAAAAAACCCCCCCCCCC"
>>> guide_seq = ["ATGATCGATCGATCG", "ATGATCGATCGATCGCCC"]
>>> gl = GuideLibrary.from_mapping(guide_seq=guide_seq, genome=genome)
>>> for collection in gl:
... for match in collection:
... print(match.as_dict())
...
{'pam_search': 'NGG', 'guide_seq': 'ATGATCGATCGATCG', 'pam_seq': 'AGG', 'pam_start': 45, 'reverse': False, 'guide_context_up': 'CTTTTTTTTTTAAAAAAAAA', 'guide_context_down': 'AAAAAAAAAACCCCCCCCCC', 'pam_end': 48, 'length': 15, 'guide_start': 30, 'guide_end': 45}
"""
if isinstance(guide_seq, str):
guide_seq = [guide_seq]
if isinstance(guide_seq, Iterable):
new_guide_seq = []
for g in guide_seq:
if isinstance(g, str):
g = FastaSequence(
name=g,
description='query_spacer',
sequence=g,
)
new_guide_seq.append(g)
guide_seq = new_guide_seq
matches = cls._from_mapping(
guide_seq=guide_seq,
genome=genome,
pam_search=pam_search,
limit=limit,
)
if in_memory:
matches = list(matches)
return cls(
genome=genome,
guide_matches=matches,
)
@staticmethod
def _from_generating(
genome: str,
max_length: int = 20,
min_length: int | None = None,
pam_search: str = "N",
limit: int | None = None
) -> Iterator[GuideMatchCollection]:
"""
Examples
========
After the bug fix in ``_from_generating`` (always store canonical
``pam_search`` in ``GuideMatchCollection``), the collection and its
contained ``GuideMatch`` must agree on ``pam_search`` regardless of strand:
>>> genome = ('ATATATATATATATATATATATATACCGTTTTTTTAAAAAAACGG'
... 'ATATATATATATAATATATATATATAATATATATATATA')
>>> gl = GuideLibrary.from_generating(genome=genome, in_memory=True)
>>> for coll in gl.guide_matches:
... match = list(coll.matches)[0]
... assert coll.pam_search == match.pam_search, (
... f"Inconsistent pam_search on reverse={match.reverse}: "
... f"collection={coll.pam_search!r} vs match={match.pam_search!r}"
... )
All pam_search values should be the canonical (forward-strand) PAM:
>>> pam_searches = {coll.pam_search for coll in gl.guide_matches}
>>> pam_searches
{'NGG'}
"""
min_length = min_length or max_length
found, guides_created = 0, 0
limit = limit or 0
no_limit = limit is None or limit == 0
for reverse in (False, True):
directionality = "reverse" if reverse else "forward"
_pam_search = reverse_complement(pam_search) if reverse else pam_search
broken = False
with tqdm(find_iupac(_pam_search, genome, overlapped=True)) as t: ## run a progress bar
for i, ((pam_start, pam_end), pam_seq) in enumerate(t):
found += 1
if no_limit or found <= limit:
pass
else:
broken = True
break
for length in range(min_length, max_length + 1):
guides_created += 1
guide_start = (pam_start - length if not reverse
else pam_end)
guide_end = (pam_start if not reverse else
pam_end + length)
guide_seq = genome[guide_start:guide_end]
t.set_postfix(direction=directionality,
at_site=pam_start,
pam_sites_found=found,
guides_created=guides_created)
gm = GuideMatch(
pam_search=pam_search,
pam_seq=pam_seq,
guide_seq=guide_seq,
pam_start=pam_start,
reverse=reverse,
)
guide_down, guide_up = get_context(
gm.pam_start,
gm.pam_end,
gm.guide_start,
gm.guide_end,
genome=genome,
reverse=reverse,
)
gm.guide_context_down = guide_down
gm.guide_context_up = guide_up
# TODO: Actually group by sequence
yield GuideMatchCollection(
guide_seq=guide_seq,
pam_search=pam_search, ## store canonical PAM, not RC
matches=[gm],
)
if broken:
break
return guides_created
[docs]
@classmethod
def from_generating(
cls,
genome: str,
max_length: int = 20,
min_length: int | None = None,
pam_search: str = "NGG",
in_memory: bool = False,
limit: int | None = None
):
"""Find all guides matching a PAM sequence in a given genome.
The default behavior is to find matches lazily to save memory and time.
Parameters
======--
genome : str
Genome sequence to search.
max_length : int, optional
Maximum guide length. Default: 20.
min_length : int, optional
Minimum guide length. Default: same as max_length.
pam_search : str, optional
IUPAC PAM sequence to search for. Default: "NGG".
in_memory : bool, optional
Whether to instantiate matches in memory. Default: lazy matching.
Examples
======
>>> genome = "ATATATATATATATATATATATATACCGTTTTTTTAAAAAAACGGATATATATATATAATATATATATATAATATATATATATA"
>>> gl = GuideLibrary.from_generating(genome=genome)
>>> len(gl)
2
>>> for match_collection in gl:
... for guide in match_collection:
... print(guide)
...
ATACCGTTTTTTTAAAAAAA
TATCCGTTTTTTTAAAAAAA
"""
matches = (
match for match in cls._from_generating(
genome,
max_length=max_length,
min_length=min_length,
pam_search=pam_search,
limit=limit,
)
)
if in_memory:
matches = list(matches)
return cls(
genome=genome,
guide_matches=matches,
)