"""Tools for annotating guide RNAs from GFF data"""
from typing import Dict, Iterable, Mapping, Optional, Union
from dataclasses import asdict
from bioino import GffFile
from carabiner import print_err
# GFF attribute keys copied onto a guide (as ``ann_<tag>``) when the caller of
# ``annotate_from_gff`` does not supply its own ``tags``.
_TAGS = ("Name", "locus_tag", "gene_biotype")
[docs]
def annotate_from_gff(
    sgRNA: Mapping[str, Union[str, int]],
    gff_data: "GffFile",
    seqid: str,
    tags: Optional[Iterable[str]] = None,
) -> Dict[str, Union[str, int]]:
    """Annotate dictionary of guide information with GFF annotations.

    Dictionary must at least have key 'pam_start' and 'pam_end' mapping to
    numerical values. The input mapping is never modified; a new dictionary
    is returned.

    Parameters
    ==========
    sgRNA : dict
        Dictionary containing 'pam_start' and 'pam_end', and optionally other
        information about a guide. The two coordinates may be given in either
        order.
    gff_data : bioino.GffFile
        GffFile object which was loaded with `lookup=True`.
    seqid : str
        Sequence identifier passed to ``gff_data.lookup_at`` together with the
        PAM midpoint.
    tags : list of str, optional
        Which GFF tags to extract from attributes of GFF features. If omitted
        (or empty), the module default ``_TAGS`` is used. Tags missing from
        the matched feature are skipped.

    Returns
    =======
    dict
        Copy of the guide RNA dictionary updated with GFF annotations:
        ``ann_<tag>`` for each tag found, ``pam_offset``, and ``ann_<column>``
        for each column of the matched feature. If the lookup returns nothing,
        a warning is printed and an unannotated copy is returned.

    Examples
    ========
    Set up a minimal single-gene GFF and build its lookup table:

    >>> from io import StringIO
    >>> from bioino import GffFile
    >>> gff_line = '\\t'.join([
    ...     'chr1', 'RefSeq', 'gene', '100', '500', '.', '+', '.',
    ...     'ID=g1;Name=geneA;locus_tag=b0001',
    ... ])
    >>> gff = GffFile.from_file(StringIO(gff_line), lookup=True)

    Guide PAM midpoint at position 251, inside the gene body (offset from gene
    start = 251 - 100 = 151):

    >>> from crispio.annotate import annotate_from_gff
    >>> result = annotate_from_gff({'pam_start': 250, 'pam_end': 253}, gff, seqid='chr1')
    >>> result['ann_Name']
    'geneA'
    >>> result['ann_locus_tag']
    'b0001'
    >>> result['pam_offset']
    151
    >>> result['ann_strand']
    '+'
    >>> result['ann_start'], result['ann_end']
    (100, 500)

    Intergenic guide (PAM midpoint 61, upstream of gene start): bioino 0.0.3
    automatically assigns the ``_up-`` prefix and computes the distance to the
    nearest feature, so no manual prefix logic is needed in crispio:

    >>> result2 = annotate_from_gff({'pam_start': 60, 'pam_end': 63}, gff, seqid='chr1')
    >>> result2['ann_locus_tag']
    '_up-geneA'
    >>> result2['pam_offset']
    39

    Unknown ``seqid`` (e.g. a plasmid not in the GFF) returns an unannotated
    copy of the input dict rather than raising:

    >>> result3 = annotate_from_gff({'pam_start': 250, 'pam_end': 253}, gff, seqid='chrX')
    >>> sorted(result3.keys())
    ['pam_end', 'pam_start']

    Custom tag set — only extract ``Name``:

    >>> result4 = annotate_from_gff({'pam_start': 250, 'pam_end': 253}, gff,
    ...                             seqid='chr1', tags=['Name'])
    >>> 'ann_Name' in result4
    True
    >>> 'ann_locus_tag' in result4
    False

    """
    tags = tags or _TAGS

    # The parameter is typed as a read-only Mapping, so build the result in a
    # fresh dict instead of writing into the caller's object.
    annotated: Dict[str, Union[str, int]] = dict(sgRNA)

    pam_start = annotated['pam_start']
    pam_end = annotated['pam_end']
    # Anchor the midpoint on the smaller coordinate so that it stays inside
    # the PAM even when pam_end < pam_start.
    pam_loc = min(pam_start, pam_end) + abs(pam_start - pam_end) // 2

    results = gff_data.lookup_at(seqid, pam_loc)
    if not results:
        print_err(f"Warning: locus {pam_loc} on {seqid!r} not covered by GFF. Skipping annotation.")
        return annotated

    # Only the first feature returned by the lookup is used.
    annotation_match = results[0]

    # bioino 0.0.3 already bakes _up-/_down- into locus_tag for intergenic positions
    for tag in tags:
        try:
            annotated[f'ann_{tag}'] = annotation_match.attributes[tag]
        except KeyError:
            # Requested tag is absent from this feature; leave it out.
            continue

    annotated["pam_offset"] = annotation_match.attributes["offset"]
    # Copy every column of the matched feature, prefixed with 'ann_'.
    annotated.update({
        f"ann_{header}": val
        for header, val in asdict(annotation_match.columns).items()
    })
    return annotated