Source code for crispio.features

from typing import Any, Callable, Dict, Iterable, List, Tuple, Optional, Union

from itertools import dropwhile, takewhile

from bioino import GffLine
from streq import correlation, gc_content, purine_content, reverse_complement

_FLOAT_FORMAT = '.3f'


def _format_float(f: Callable[[Any], float]) -> Callable[[Any], float]:
    def _f(*args, **kwargs):
        return format(f(*args, **kwargs), _FLOAT_FORMAT)
    return _f


@_format_float
def _autocorrelation(gff: GffLine, 
                     attribute: str,
                     scaffold: Optional[str] = None) -> float:
    seq = gff.attributes[attribute]
    return correlation(
        seq, 
        scaffold or reverse_complement(seq),
        wobble=True,
    )


def _pam_leading_n(
    gff: GffLine,
    scaffold: Optional[str] = None
) -> str:

    x = takewhile(
        lambda x: x[0] == 'N', 
        zip(
            gff.attributes['pam_search'], 
            gff.attributes['pam_sequence'],
        ),
    )
    return "".join(item[1] for item in x)


def _pam_defined(
    gff: GffLine, 
    scaffold: Optional[str] = None
) -> str:

    x = dropwhile(
        lambda x: x[0] == 'N', 
        zip(gff.attributes['pam_search'], 
        gff.attributes['pam_sequence'],
    ))
    return "".join(item[1] for item in x)


_FEATURIZERS = {
    'on_nontemplate_strand': lambda gff, _: gff.columns.strand != gff.attributes['ann_strand'],
    'context_up2': lambda gff, _: gff.attributes['guide_context_up'][-2:],
    'context_down2': lambda gff, _: gff.attributes['guide_context_down'][:2],
    'context_up_autocorr': lambda gff, _: _autocorrelation(gff, 'guide_context_up'),
    'pam_n': _pam_leading_n,
    'pam_def': _pam_defined,
    'pam_gc': lambda gff, _: _format_float(gc_content)(gff.attributes['pam_sequence']),
    'pam_autocorr': lambda gff, _: _autocorrelation(gff, 'pam_sequence'),
    'pam_scaff_corr': lambda gff, scaffold: _autocorrelation(gff, 'pam_sequence', scaffold),
    'guide_purine': lambda gff, _: _format_float(purine_content)(gff.attributes['guide_sequence']),
    'guide_gc': lambda gff, _: _format_float(gc_content)(gff.attributes['guide_sequence']),
    'seed_seq': lambda gff, _: gff.attributes['guide_sequence'][-5:],
    'guide_start3': lambda gff, _: gff.attributes['guide_sequence'][:3],
    'guide_end3': lambda gff, _: gff.attributes['guide_sequence'][-3:],
    'guide_autocorr': lambda gff, _: _autocorrelation(gff, 'guide_sequence'),
    'guide_scaff_corr': lambda gff, scaffold: _autocorrelation(gff, 'guide_sequence', scaffold),
}


[docs] def get_features() -> List[str]: """Get the list of available features.""" return list(_FEATURIZERS)
[docs] def featurize( gff: GffLine, features: Optional[Union[str, Iterable[str]]] = None, scaffold: Optional[str] = None ) -> Union[int, str, Dict[str, Union[int, str]]]: """Featurize a guide RNA represented by a `bioino.GffLine`. Depending on the feature to be calculated, the GFF should have attributes 'pam_sequence', 'guide_sequence', 'guide_context_up', 'guide_context_down', and 'ann_strand'. Parameters ---------- gff : bioino.GffLine Input guide RNA with additional attributes. features : str or list of str, optional The names of the features to be calculated. Default: calculate all. scaffold : str, optional Guide scaffold. Required for some features. If `features` is the default, scaffold must be provided. Returns ------- dict, float, or str If `features` is a string, then returns the value of the feature. If it is a list, then returns a dictionary mapping feature names to values. Raises ------ KeyError If any `features` are not supported. ValueError If `features` is neither a string nor iterable. AttributeError If `features` is default but `scaffold` is not provided. Examples -------- Build a representative GffLine (attributes required by the feature set): >>> from bioino import GffLine >>> gff_line = GffLine( ... ['chr1', 'crispio', 'protospacer', 1, 20, '.', '+', '.'], ... { ... 'guide_sequence': 'ATCGATCGATCGATCGATCG', ... 'pam_sequence': 'CGG', ... 'pam_search': 'NGG', ... 'guide_context_up': 'AAAAAAAAAAAAAAAAAACC', ... 'guide_context_down':'TTTTTTTTTTTTTTTTTTGG', ... 'ann_strand': '-', ... }, ... ) Single feature by name — returns the raw value (not wrapped in a dict): >>> from crispio.features import featurize >>> featurize(gff_line, 'guide_gc') '0.500' >>> featurize(gff_line, 'guide_purine') '0.500' >>> featurize(gff_line, 'seed_seq') 'GATCG' >>> featurize(gff_line, 'guide_start3') 'ATC' >>> featurize(gff_line, 'guide_end3') 'TCG' >>> featurize(gff_line, 'pam_gc') '1.000' >>> featurize(gff_line, 'pam_n') 'C' >>> featurize(gff_line, 'pam_def') 'GG' >>> featurize(gff_line, 'context_up2') 'CC' >>> featurize(gff_line, 'context_down2') 'TT' >>> featurize(gff_line, 'on_nontemplate_strand') True >>> featurize(gff_line, 'guide_autocorr') '8.223' >>> featurize(gff_line, 'pam_autocorr') '1.500' List of features — returns a ``feat_``-prefixed dict: >>> featurize(gff_line, features=['guide_gc', 'seed_seq', 'guide_start3']) {'feat_guide_gc': '0.500', 'feat_seed_seq': 'GATCG', 'feat_guide_start3': 'ATC'} All features require a scaffold **sequence** (not a name string). Retrieve it from ``crispio.utils.sequences``: >>> from crispio.utils import sequences >>> scaffold_seq = sequences.scaffolds['Sth1'] >>> result = featurize(gff_line, scaffold=scaffold_seq) >>> sorted(result.keys()) # doctest: +NORMALIZE_WHITESPACE ['feat_context_down2', 'feat_context_up2', 'feat_context_up_autocorr', 'feat_guide_autocorr', 'feat_guide_end3', 'feat_guide_gc', 'feat_guide_purine', 'feat_guide_scaff_corr', 'feat_guide_start3', 'feat_on_nontemplate_strand', 'feat_pam_autocorr', 'feat_pam_def', 'feat_pam_gc', 'feat_pam_n', 'feat_pam_scaff_corr', 'feat_seed_seq'] >>> result['feat_guide_scaff_corr'] '9.770' >>> result['feat_pam_scaff_corr'] '2.667' Calling without a scaffold when computing all features raises ``AttributeError``, not ``TypeError``: >>> featurize(gff_line) Traceback (most recent call last): ... AttributeError: Scaffold must be provided to calculate all features. Unknown feature name raises ``KeyError``: >>> featurize(gff_line, 'not_a_feature') Traceback (most recent call last): ... KeyError: 'not_a_feature' """ if features is None: features = get_features() if scaffold is None: raise AttributeError("Scaffold must be provided to calculate all features.") if isinstance(features, str): return _FEATURIZERS[features](gff, scaffold) elif isinstance(features, Iterable): return {f"feat_{feature}": _FEATURIZERS[feature](gff, scaffold) for feature in features} else: raise ValueError(f"Requested feature {features} is not a string or iterable.")
[docs] def get_context( pam_start: int, pam_end: int, guide_start: int, guide_end: int, genome: str, reverse: bool, extra_bases: int = 20 ) -> Tuple[str, str]: """Get surrounding sequence. Examples ======== Use a genome with visually distinct regions to make direction clear: ``AAAA|CCCC|GGGG|TTTT|ACGT|TGCA`` (blocks of 4, 24 bp total) >>> genome = 'AAAA' + 'CCCC' + 'GGGG' + 'TTTT' + 'ACGT' + 'TGCA' Forward strand — guide at [4:8], PAM at [8:12], context window of 4 nt: upstream context is the 4 nt *before* the guide; downstream is the 4 nt *after* the PAM: >>> get_context(pam_start=8, pam_end=12, ... guide_start=4, guide_end=8, ... genome=genome, reverse=False, extra_bases=4) ('TTTT', 'AAAA') Reverse strand — PAM at [4:8] (on forward), guide at [8:12]; context is reverse-complemented and directions are flipped: >>> get_context(pam_start=4, pam_end=8, ... guide_start=8, guide_end=12, ... genome=genome, reverse=True, extra_bases=4) ('TTTT', 'AAAA') Context window at the right edge of the genome is truncated gracefully — Python slice semantics give an empty string rather than an error: >>> get_context(pam_start=20, pam_end=24, ... guide_start=16, guide_end=20, ... genome=genome, reverse=False, extra_bases=4) ('', 'TTTT') """ if not reverse: guide_down = genome[pam_end:(pam_end + extra_bases)] guide_up = genome[(guide_start - extra_bases):guide_start] else: guide_down = reverse_complement(genome[(pam_start - extra_bases):pam_start]) guide_up = reverse_complement(genome[guide_end:(guide_end + extra_bases)]) return guide_down, guide_up