# Source code for crispio.annotate

"""Tools for annotating guide RNAs from GFF data"""

from typing import Dict, Iterable, Mapping, Optional, Union

from dataclasses import asdict

from bioino import GffFile
from carabiner import print_err

# Default GFF attribute tags copied onto a guide when the caller passes no `tags`.
_TAGS = ("Name", "locus_tag", "gene_biotype")
            

# [docs] (Sphinx viewcode link marker; not part of the module)
def annotate_from_gff(
    sgRNA: Mapping[str, Union[str, int]], 
    gff_data: GffFile,
    seqid: str,
    tags: Optional[Iterable[str]] = None,
) -> Dict[str, Union[str, int]]:
    """Annotate dictionary of guide information with GFF annotations.

    Dictionary must at least have key 'pam_start' and 'pam_end' mapping to 
    numerical values. The annotation is looked up at the midpoint of the PAM,
    regardless of which of the two coordinates is the larger one.

    The input dictionary is updated in place and the same object is returned,
    so it must support item assignment.

    Parameters
    ==========
    sgRNA : dict
        Dictionary containing 'pam_start' and 'pam_end', and optionally other
        information about a guide.
    gff_data : bioino.GffFile
        GffFile object which was loaded with `lookup=True`.
    seqid : str
        Sequence identifier (e.g. chromosome or plasmid name) on which to look
        up the PAM position.
    tags : list of str, optional
        Which GFF tags to extract from attributes of GFF features. Defaults to
        'Name', 'locus_tag' and 'gene_biotype'. Tags missing from the matched
        feature are skipped.

    Returns
    =======
    dict
        Guide RNA dictionary updated with GFF annotations: 'ann_<tag>' for each
        requested tag that is present, 'pam_offset', and 'ann_<column>' for
        each column of the matched feature. If the lookup returns nothing, a
        warning is printed and the dictionary is returned without annotations.

    Examples
    ========
    Set up a minimal single-gene GFF and build its lookup table:
 
    >>> from io import StringIO
    >>> from bioino import GffFile
    >>> gff_line = '\\t'.join([
    ...     'chr1', 'RefSeq', 'gene', '100', '500', '.', '+', '.',
    ...     'ID=g1;Name=geneA;locus_tag=b0001',
    ... ])
    >>> gff = GffFile.from_file(StringIO(gff_line), lookup=True)
 
    Guide PAM midpoint at position 251, inside the gene body (offset from gene
    start = 251 - 100 = 151):
 
    >>> from crispio.annotate import annotate_from_gff
    >>> result = annotate_from_gff({'pam_start': 250, 'pam_end': 253}, gff, seqid='chr1')
    >>> result['ann_Name']
    'geneA'
    >>> result['ann_locus_tag']
    'b0001'
    >>> result['pam_offset']
    151
    >>> result['ann_strand']
    '+'
    >>> result['ann_start'], result['ann_end']
    (100, 500)

    Reversed PAM coordinates resolve to the same midpoint (251):

    >>> flipped = annotate_from_gff({'pam_start': 253, 'pam_end': 250}, gff, seqid='chr1')
    >>> flipped['pam_offset']
    151
 
    Intergenic guide (PAM midpoint 61, upstream of gene start): bioino 0.0.3
    automatically assigns the ``_up-`` prefix and computes the distance to the
    nearest feature, so no manual prefix logic is needed in crispio:
 
    >>> result2 = annotate_from_gff({'pam_start': 60, 'pam_end': 63}, gff, seqid='chr1')
    >>> result2['ann_locus_tag']
    '_up-geneA'
    >>> result2['pam_offset']
    39
 
    Unknown ``seqid`` (e.g. a plasmid not in the GFF) returns the input dict
    unchanged rather than raising:
 
    >>> result3 = annotate_from_gff({'pam_start': 250, 'pam_end': 253}, gff, seqid='chrX')
    >>> sorted(result3.keys())
    ['pam_end', 'pam_start']
 
    Custom tag set — only extract ``Name``:
 
    >>> result4 = annotate_from_gff({'pam_start': 250, 'pam_end': 253}, gff,
    ...                              seqid='chr1', tags=['Name'])
    >>> 'ann_Name' in result4
    True
    >>> 'ann_locus_tag' in result4
    False
    
    """
    
    tags = tags or _TAGS

    # Midpoint of the PAM. Anchor on the smaller coordinate so that reversed
    # coordinates (pam_end < pam_start) still land inside the PAM instead of
    # half a PAM-width beyond pam_start.
    pam_start, pam_end = sgRNA['pam_start'], sgRNA['pam_end']
    pam_loc = min(pam_start, pam_end) + abs(pam_start - pam_end) // 2

    results = gff_data.lookup_at(seqid, pam_loc)
    if not results:
        print_err(f"Warning: locus {pam_loc} on {seqid!r} not covered by GFF. Skipping annotation.")
        return sgRNA
    
    # Only the first match is used for annotation.
    annotation_matches = results[0]
    # bioino 0.0.3 already bakes _up-/_down- into locus_tag for intergenic positions
    for tag in tags:
        try:
            sgRNA[f'ann_{tag}'] = annotation_matches.attributes[tag]
        except KeyError:
            # Best effort: not every feature carries every requested tag.
            pass

    # Unlike the optional tags above, a missing "offset" attribute raises KeyError.
    sgRNA["pam_offset"] = annotation_matches.attributes["offset"]
    # Copy every column of the matched feature under an "ann_" prefix.
    sgRNA.update({
        f"ann_{header}": val 
        for header, val in asdict(annotation_matches.columns).items()
    })
    return sgRNA