Source code for VAPr.annovar_output_parsing

# built-in libraries
import csv
import itertools
import logging
import sys

# third-party libraries
import myvariant

# project libraries
from VAPr.vcf_genotype_fields_parsing import VCFGenotypeParser


[docs]class AnnovarTxtParser(object): """ Class that processes an Annovar-created tab-delimited text file.""" CHR_HEADER = 'chr' START_HEADER = 'start' END_HEADER = 'end' REF_HEADER = 'ref' ALT_HEADER = 'alt' OTHERINFO_HEADER = 'otherinfo' THOU_G_2015_ALL_HEADER = '1000g2015aug_all' ESP6500_ALL_HEADER = 'esp6500siv2_all' NCI60_HEADER = 'nci60' CYTOBAND_HEADER = 'cytoband' GENOMIC_SUPERDUPS_HEADER = 'genomicsuperdups' TFBS_CONS_SITES_HEADER = 'tfbsconssites' FUNC_KNOWNGENE_HEADER = 'func_knowngene' GENE_KNOWNGENE_HEADER = 'gene_knowngene' GENEDETAIL_KNOWNGENE_HEADER = 'genedetail_knowngene' EXONICFUNC_KNOWNGENE_HEADER = 'exonicfunc_knowngene' SCORE_KEY = "Score" RAW_CHR_MT_VAL = "chrM" RAW_CHR_MT_SUFFIX_VAL = RAW_CHR_MT_VAL.replace(CHR_HEADER, "") STANDARDIZED_CHR_MT_VAL = "chrMT" STANDARDIZED_CHR_MT_SUFFIX_VAL = STANDARDIZED_CHR_MT_VAL.replace(CHR_HEADER, "") _ANNOVAR_OUTPUT_COLS = [CHR_HEADER, START_HEADER, END_HEADER, REF_HEADER, ALT_HEADER, FUNC_KNOWNGENE_HEADER, GENE_KNOWNGENE_HEADER, GENEDETAIL_KNOWNGENE_HEADER, EXONICFUNC_KNOWNGENE_HEADER, # CYTOBAND_HEADER, # GENOMIC_SUPERDUPS_HEADER, THOU_G_2015_ALL_HEADER] #, # ESP6500_ALL_HEADER, # 'cosmic70', # NCI60_HEADER, #OTHERINFO_HEADER] @staticmethod def _normalize_header(raw_headers_list): """Lower-case all header strings and replace periods with underscores. Args: raw_headers_list (List[str]): A list of the headers from the Annovar txt output file. Returns: List[str]: A list of the normalized headers, in the same order as the input list. """ normalized_headers_list = [] for curr_header in raw_headers_list: normalized_headers_list.append(curr_header.lower().replace('.', '_')) return normalized_headers_list
[docs] @classmethod def read_chunk_of_annotations_to_dicts_list(cls, annovar_txt_file_like_obj, sample_names_list, chunk_index, chunk_size): annotations_dict_per_variant_list = [] hgvsid_list = [] # read in the header line and normalize the header names reader = csv.reader(annovar_txt_file_like_obj, delimiter='\t') normed_headers_list = cls._normalize_header(next(reader)) # for each row in this chunk--which is to say, each variant for curr_line_fields_list in itertools.islice(reader, (chunk_index * chunk_size), ((chunk_index + 1) * chunk_size)): hgvs_id, annotations_dict_for_curr_variant = cls._parse_single_variant_record( normed_headers_list, curr_line_fields_list, sample_names_list) hgvsid_list.append(hgvs_id) annotations_dict_per_variant_list.append(annotations_dict_for_curr_variant) return hgvsid_list, annotations_dict_per_variant_list
@classmethod def _parse_single_variant_record(cls, normed_headers_list, curr_line_fields_list, sample_names_list): # This code assumes that the VCF-produced format string and the genotype fields string(s) for the sample(s) # will be the last fields on every line and that they will NOT all have their own headers--rather, it # assumes the last header will indicate that the rest of the fields are "other info". Here is a simplified # example: # chr start end ref alt func_knowngene otherinfo # chrM 146 146 T C upstream;downstream 1 61.74 AC=2;AF=1.00;AN=2;DP=2;FS=0.000 GT:AD:DP:GQ:PL 1/1:0,22:22:66:794,66,0 ./.:0,0 1/1:0,40:40:99:1494,119,0 # Note that the content at and after the position of the otherinfo header may list additional information # before the format string and genotype fields info (which are required to be at the end of the line); # any such extra info is ignored. # make a dictionary that pairs every named header *except* the last one with its content in this line last_field_index = len(normed_headers_list) - 1 raw_fields_dict = dict(zip(normed_headers_list[0:last_field_index], curr_line_fields_list[0:last_field_index])) # TODO: someday: perhaps stop limiting this to only the fields in ANNOVAR_OUTPUT_COLS instead of all # For only a limited subset of columns, look those columns up in raw_fields_dict; if they hold real content, # do any clean-up necessary to their values and write them into a new dict cleaned_fields_dict = {} for curr_header in cls._ANNOVAR_OUTPUT_COLS: curr_value = raw_fields_dict[curr_header] if curr_value != ".": curr_value = cls._rewrite_value_if_special_header(curr_header, curr_value) cleaned_fields_dict[curr_header] = curr_value # generate the hgvs id for this variant hgvs_id = myvariant.format_hgvs(cleaned_fields_dict[cls.CHR_HEADER], cleaned_fields_dict[cls.START_HEADER], cleaned_fields_dict[cls.REF_HEADER], cleaned_fields_dict[cls.ALT_HEADER]) # now grab the number-of-samples-plus-one-th field from the *end* of the line--this holds the format # string--and also grab a list of the number-of-samples fields from the *end* of the line--these are # the genotype fields strings for each sample. num_samples_plus_one = len(sample_names_list) + 1 format_string = curr_line_fields_list[-num_samples_plus_one] genotype_field_strings_per_sample = curr_line_fields_list[-len(sample_names_list)::] genotype_field_strings_by_sample_name = dict(zip(sample_names_list, genotype_field_strings_per_sample)) # turn the dictionary of annovar fields into a dictionary of annotations for the variant, including # nested structures containing sample-specific genotype-related info annotations_dict_for_curr_variant = AnnovarAnnotatedVariant.make_per_variant_annotation_dict( cleaned_fields_dict, hgvs_id, format_string, genotype_field_strings_by_sample_name) return hgvs_id, annotations_dict_for_curr_variant @classmethod def _rewrite_value_if_special_header(cls, format_header, field_value): result = field_value # by default, assume no special coddling needed if format_header == cls.CHR_HEADER: if field_value == cls.RAW_CHR_MT_VAL: result = cls.STANDARDIZED_CHR_MT_VAL elif format_header in [cls.THOU_G_2015_ALL_HEADER, cls.ESP6500_ALL_HEADER, cls.NCI60_HEADER]: result = float(field_value) elif format_header in [cls.START_HEADER, cls.END_HEADER]: result = int(field_value) # elif curr_header == cls.CYTOBAND_HEADER: # cytoband_data = CytoBand(curr_value) # result = cytoband_data.fill() elif format_header in [cls.GENOMIC_SUPERDUPS_HEADER, cls.TFBS_CONS_SITES_HEADER]: result = cls._parse_to_dict_with_score_key(field_value) # end checking if this is a field that needs special coddling return result @classmethod def _parse_to_dict_with_score_key(cls, delimited_str): """Parse delimited string of key/value pairs to a dictionary, with Score key's value cast to a float. Args: delimited_str (str): A string of key/value pairs, with each pair delimited from the next by a ';' and, with each pair, the key delimited from the value by an '='. A key named 'Score' must be present. Returns: dict(str, Any): Dictionary of key/value pairs from the input string; the value for the 'Score' key is cast to a float. """ key_val_pairs_as_str = delimited_str.split(";") result = dict(curr_key_val_pair_str.split("=") for curr_key_val_pair_str in key_val_pairs_as_str) result[cls.SCORE_KEY] = float(result[cls.SCORE_KEY]) return result
[docs]class AnnovarAnnotatedVariant(object): GENOTYPE_KEY = 'genotype' FILTER_PASSING_READS_COUNT_KEY = 'filter_passing_reads_count' GENOTYPE_LIKELIHOODS_KEY = 'genotype_likelihoods' SAMPLES_KEY = 'samples' SAMPLE_ID_KEY = 'sample_id' GENOTYPE_SUBCLASS_BY_CLASS_KEY = 'genotype_subclass_by_class' HGVS_ID_KEY = 'hgvs_id' ALLELE_DEPTH_KEY = 'AD' @staticmethod def _list_has_valid_content(a_list): """Return true if list has at least one non-None entry, false otherwise. Args: a_list (List[Any]): The list to evaluate for validity. Returns: bool: True if list has at least one non-None entry, false otherwise. """ is_valid = len(a_list) > 0 and not all(curr_item is None for curr_item in a_list) return is_valid
[docs] @classmethod def make_per_variant_annotation_dict(cls, fields_by_annovar_header, hgvs_id, format_string, genotype_field_strings_by_sample_name): result = fields_by_annovar_header result[cls.HGVS_ID_KEY] = hgvs_id # parse sample-level info into a list of per-sample dicts and add that list to the variant-level dict sample_specific_dicts_list = [] for curr_sample_name, genotype_fields_string_for_curr_sample in genotype_field_strings_by_sample_name.items(): sample_specific_dict = cls._make_per_sample_annotation_dict(curr_sample_name, format_string, genotype_fields_string_for_curr_sample) if sample_specific_dict is not None: sample_specific_dicts_list.append(sample_specific_dict) result[cls.SAMPLES_KEY] = sample_specific_dicts_list return result
@classmethod def _make_per_sample_annotation_dict(cls, sample_name, format_string, genotype_fields_string): if not VCFGenotypeParser.is_valid_genotype_fields_string(genotype_fields_string): return None genotype_info = VCFGenotypeParser.parse(format_string, genotype_fields_string) if genotype_info is None: return None # Always include a genotype key, EVEN IF the value for that key is None result = {cls.SAMPLE_ID_KEY: sample_name, cls.GENOTYPE_KEY: genotype_info.genotype} # for all other keys, include them only if they have meaningful values if genotype_info.genotype is not None: result[cls.GENOTYPE_SUBCLASS_BY_CLASS_KEY] = genotype_info.genotype_subclass_by_class if genotype_info.filter_passing_reads_count is not None: result[cls.FILTER_PASSING_READS_COUNT_KEY] = genotype_info.filter_passing_reads_count genotype_likelihoods_list = [i.likelihood_neg_exponent for i in genotype_info.genotype_likelihoods] if cls._list_has_valid_content(genotype_likelihoods_list): result[cls.GENOTYPE_LIKELIHOODS_KEY] = genotype_likelihoods_list unfiltered_read_depths_list = [i.unfiltered_read_counts for i in genotype_info.alleles] if cls._list_has_valid_content(unfiltered_read_depths_list): result[cls.ALLELE_DEPTH_KEY] = unfiltered_read_depths_list return result
# # Gets its own class because it is particularly pesky to parse # class CytoBand(object): # NAME_KEY = "Name" # CHROMOSOME_KEY = "Chromosome" # BAND_KEY = "Band" # REGION_KEY = "Region" # SUBBAND_KEY = "Sub_Band" # # def __init__(self, cyto_band_name): # # self.letters = set('XY') # self.name = cyto_band_name # self.processed = self.fill() # # def fill(self): # # processed = {self.NAME_KEY: self.name} # spliced = re.split('(\D+)', self.name) # \D means "any non-digit", so \D+ means 1 or more non-digit # # if any((c in self.letters) for c in self.name): # processed[self.CHROMOSOME_KEY] = spliced[1][0] # processed[self.BAND_KEY] = spliced[1][1] # else: # processed[self.CHROMOSOME_KEY] = int(spliced[0]) # processed[self.BAND_KEY] = spliced[1] # # processed[self.REGION_KEY] = spliced[2] # # if '.' in spliced: # processed[self.SUBBAND_KEY] = spliced[-1] # return processed