#!/bin/python """ InComparaNoid: Script with different analysis methods used to compare Ensembl and Inparanoid species projections. """ #---------------------------------------------------------------------------------------------------------------------- # modules #---------------------------------------------------------------------------------------------------------------------- import os import sys import argparse import re from matplotlib_venn import venn2 from matplotlib import pyplot as plt #---------------------------------------------------------------------------------------------------------------------- # globals #---------------------------------------------------------------------------------------------------------------------- list_stats = [] dict_uniprot_map = {} dict_ens_map = {} dict_inp_map = {} COUNT_TOTAL_REF_LOCI = 0 # num of TOTAL curated reference loci used to generate orthology projections #---------------------------------------------------------------------------------------------------------------------- # functions #---------------------------------------------------------------------------------------------------------------------- #---------------------------------------------------------------------------------------------------------------------- def create_dict_uniprot_map(uniprot_substitution_path) : #---------------------------------------------------------------------------------------------------------------------- """ create reference-to-uniprot mapping dictionary """ dict_uniprot_map = {} # local # read map file, populate dict (it is possible to have a many-to-1 LOC-to-Uniprot relationship; this is ok for projection inference) UNI = open(uniprot_substitution_path) for line in UNI : cols = line.rstrip().split() loc = cols[0] uniprot = cols[1] if loc in dict_uniprot_map : # this should not occur; 1-to-many LOC-to-Uniprot relationships not allowed (only the inverse) dict_uniprot_map[loc].add(uniprot) else : dict_uniprot_map[loc] = uniprot UNI.close() return dict_uniprot_map #---------------------------------------------------------------------------------------------------------------------- def create_inp_map(inparanoid_input_path, dict_uniprot_map) : #---------------------------------------------------------------------------------------------------------------------- """ open the inparanoid file (which is already loci-filtered for curated reference set and does not require RAP-to-MSU back-conversion) and generate a 2-col mapping of PRJ to LOC loci """ dict_inp_map = {} # local ensembl orthology dict INP = open(inparanoid_input_path) for line in INP : cols = line.rstrip().split() # for now, this next step is suspended. we are accepting all transcripts: isoformic, multiple sequencing lane IDs, etc. # ignore non-canonical orthologs #if int(cols[1][-1]) > 1 : # continue os_locus = cols[0].rstrip("1").rstrip(".") # swap loc for uniprot, if specified if dict_uniprot_map : os_locus = dict_uniprot_map[os_locus] #prj_locus = cols[1].rsplit("_",1)[0].rsplit(".",1)[0].rsplit("-",1)[0] # remove any isoform suffixes (i.e. '.#', '_T0#', '-#') prj_locus = cols[1] if os_locus in dict_inp_map : dict_inp_map[os_locus].add(prj_locus) else : dict_inp_map[os_locus] = set([prj_locus]) INP.close() return dict_inp_map #---------------------------------------------------------------------------------------------------------------------- def create_ens_map(filtering_loci_path, ensembl_input_path, rap_map_path, recip_id, dict_uniprot_map, is_confident) : #---------------------------------------------------------------------------------------------------------------------- """ open the ensemble plants and rap::irgsp mapping files and generate a hash mapping of reference to projected loci where reciprocal identity is >= recip_id% and confidence is high. also pre-filter against set of curated Plant Reactome reference loci """ dict_ens_map = {} # local ensembl orthology dict dict_rap_map = {} # local MSU-RAP dict, using only filtered canonical LOC loci (lowest available: ".1" is first preference, ".2" second preference, and so on) # generate internal MSU-RAP map RAP_MAP = open(rap_map_path) for line in RAP_MAP: if line.strip() != "" : cols = line.rstrip().split() rap_id = cols[0].upper() set_loc_ids = set(cols[1].upper().split(",")) # select only the first locus, if it exists for loc_id in set_loc_ids : if loc_id != "NONE" : canonical = loc_id.split(".") dict_rap_map[rap_id] = canonical[0] break; RAP_MAP.close() #for keys, values in dict_rap_map.items() : # print(keys + " | " + values) # generate ref loci filter FILTER = open(filtering_loci_path) loci_filter = set() for line in FILTER : loci_filter.add(line.rstrip()) global COUNT_TOTAL_REF_LOCI COUNT_TOTAL_REF_LOCI = len(loci_filter) #for locus in loci_filter : # print locus ENS = open(ensembl_input_path) for line in ENS : cols = line.rstrip().split() if len(cols) == 5 : if dict_uniprot_map : # if this ens os is already in uniprot list if cols[0] in dict_uniprot_map : # get uniprot id and build into ens_map os_locus = dict_uniprot_map[cols[0]] # reciprocal identity is >= recip_id%, optional high confidence if int(cols[2]) >= recip_id and int(cols[3]) >= recip_id and int(cols[4]) >= is_confident : if os_locus in dict_ens_map : dict_ens_map[os_locus].add(cols[1]) else : dict_ens_map[os_locus] = set([cols[1]]) else : if cols[0] in dict_rap_map : os_locus = dict_rap_map[cols[0]] if os_locus in loci_filter : # swap loc for uniprot os_locus = dict_uniprot_map[os_locus] # reciprocal identity is >= recip_id%, optional high confidence if int(cols[2]) >= recip_id and int(cols[3]) >= recip_id and int(cols[4]) >= is_confident : if os_locus in dict_ens_map : dict_ens_map[os_locus].add(cols[1]) else : dict_ens_map[os_locus] = set([cols[1]]) ENS.close() return dict_ens_map #---------------------------------------------------------------------------------------------------------------------- def compare_maps(dict_cmp_map, dict_inp_map, comparison_file_path, compara_output_path, inparanoid_output_path) : #---------------------------------------------------------------------------------------------------------------------- """compare Ensembl and Inparanoid projection results""" # reference loci inp_ref_loci = len(dict_inp_map.keys()) # inparanoid ref loci cmp_ref_loci = len(dict_cmp_map.keys()) # compara ref loci set_inp_ref_loci = set(dict_inp_map.keys()) set_cmp_ref_loci = set(dict_cmp_map.keys()) inp_exc_ref_loci = len(set_inp_ref_loci - set_cmp_ref_loci) # inparanoid exclusive ref loci cmp_exc_ref_loci = len(set_cmp_ref_loci - set_inp_ref_loci) # compara exclusive ref loci union_ref_loci = len(set_inp_ref_loci | set_cmp_ref_loci) # all ref loci intersection_ref_loci = len(set_inp_ref_loci & set_cmp_ref_loci) # all shared ref loci # JP - not needed at this time; coverage over total curated set of reference loci is more useful #inp_ref_loci_coverage = '{:.2%}'.format(inp_ref_loci/float(union_ref_loci)) #cmp_ref_loci_coverage = '{:.2%}'.format(cmp_ref_loci/float(union_ref_loci)) """ projected loci (all non-unique, meaning recounted relative to each ref locus; in other words, not a collapsed set of projections that would ignore 1-to-many ref loci projections """ inp_prj_loci = sum([len(v) for v in dict_inp_map.itervalues()]) # total inparanoid projected loci cmp_prj_loci = sum([len(v) for v in dict_cmp_map.itervalues()]) # total compara projected loci # iterate over both map files and build an overlap map, for the purpose of counting inclusive and exclusive projections by locus # ref_dict structure: {os locus : [[ens projected locus, ...], [inp projected locus, ...], # of common loci]} ref_dict = {} # build ref_dict (Compara) for ref_locus, prj_loci in dict_cmp_map.iteritems() : if ref_locus in ref_dict : ref_dict[ref_locus][0].extend(prj_loci) # add Compara projected loci in the first list slot else : ref_dict[ref_locus] = [prj_loci, []] # continue build of ref_dict (Inparanoid) for ref_locus, prj_loci in dict_inp_map.iteritems() : if ref_locus in ref_dict : ref_dict[ref_locus][1].extend(prj_loci) # put the Inparanoid projected loci in the second list slot else : ref_dict[ref_locus] = [[], prj_loci] # create an empty slot for the Compara projected loci, put the Inparanoid projected loci in the second list slot inp_exc_prj_loci = 0 # inparanoid exclusive projected loci cmp_exc_prj_loci = 0 # compara exclusive projected loci union_prj_loci = 0 # all projections, duplicates removed only at the ref locus level intersection_prj_loci = 0 # all overlapping projections # calculate common projected loci overlap sets for k, v in ref_dict.iteritems() : curr_intersection_prj_loci = len(set(v[0]) & set(v[1])) v.append(curr_intersection_prj_loci) intersection_prj_loci += curr_intersection_prj_loci union_prj_loci += len(set(v[0]) | set(v[1])) inp_exc_prj_loci += len(set(v[1]) - set(v[0])) cmp_exc_prj_loci += len(set(v[0]) - set(v[1])) # JP - not needed at this time #inp_prj_loci_coverage = '{:.2%}'.format(inp_prj_loci/float(union_prj_loci)) #cmp_prj_loci_coverage = '{:.2%}'.format(cmp_prj_loci/float(union_prj_loci)) # percent coverage of orig. curated reference loci set inp_prj_curated_loci_coverage = '{:.2%}'.format(inp_ref_loci/float(COUNT_TOTAL_REF_LOCI)) cmp_prj_curated_loci_coverage = '{:.2%}'.format(cmp_ref_loci/float(COUNT_TOTAL_REF_LOCI)) # other stats #avg_projections_per_locus #avg_overlapping_projections_per_locus # calculate avg. common projected loci overlaps #avg_prj_overlap = float(total_intersection_projections) / float(len(ref_dict)) #avg_inc_prj_overlap = float(total_union_projections) / float(inc_os) list_stats.extend([{"[Comparison Stats]": ""}, {"\n-- Reference Loci --":""}, {"Total number of curated reference loci:": str(COUNT_TOTAL_REF_LOCI)}, {"Inparanoid reference loci:": str(inp_ref_loci)}, {"Compara reference loci:": str(cmp_ref_loci)}, {"Exclusive Inparanoid reference loci:": str(inp_exc_ref_loci)}, {"Exclusive Compara reference loci:": str(cmp_exc_ref_loci)}, {"All reference loci (union):": str(union_ref_loci)}, {"All shared reference loci (intersection):": str(intersection_ref_loci)}, #{"Inparanoid reference loci coverage:": inp_ref_loci_coverage}, #{"Compara reference loci coverage:": cmp_ref_loci_coverage}, {"Inparanoid coverage of curated reference loci:": inp_prj_curated_loci_coverage}, {"Compara coverage of curated reference loci:": cmp_prj_curated_loci_coverage}, {"\n-- Projected Loci (all non-unique with respect to reference loci) --":""}, {"Inparanoid projected loci:": str(inp_prj_loci)}, {"Compara projected loci:": str(cmp_prj_loci)}, {"Inparanoid exclusive projected loci:": str(inp_exc_prj_loci)}, {"Compara exclusive projected loci:": str(cmp_exc_prj_loci)}, {"All projected loci (union):": str(union_prj_loci)}, {"All shared projected loci (intersection):": str(intersection_prj_loci)}, #{"Inparanoid projected loci coverage:": inp_prj_loci_coverage}, #{"Compara projected loci coverage:": cmp_prj_loci_coverage}, ]) for item in list_stats : for k, v in item.iteritems() : print(k + " " + v) if comparison_file_path : COMP_STATS = open(comparison_file_path, 'w') for item in list_stats : for k, v in item.iteritems() : COMP_STATS.write(k + " " + v + "\n") COMP_STATS.close() if inparanoid_output_path : INP_OUT_FILE = open(inparanoid_output_path,'w') INP_FLAT_OUT_FILE = open(inparanoid_output_path + ".flat",'w') for k, v in sorted(dict_inp_map.iteritems()) : INP_OUT_FILE.write(k + "\t" + " ".join(v) + "\n") for projection in sorted(v) : INP_FLAT_OUT_FILE.write(k + "\t" + projection + "\n") INP_OUT_FILE.close() INP_FLAT_OUT_FILE.close() if compara_output_path : CMP_OUT_FILE = open(compara_output_path,'w') CMP_FLAT_OUT_FILE = open(compara_output_path + ".flat",'w') for k, v in sorted(dict_cmp_map.iteritems()) : CMP_OUT_FILE.write(k + "\t" + " ".join(v) + "\n") for projection in sorted(v) : CMP_FLAT_OUT_FILE.write(k + "\t" + projection + "\n") CMP_OUT_FILE.close() CMP_FLAT_OUT_FILE.close() return [[set_inp_ref_loci, set_cmp_ref_loci], [inp_exc_prj_loci, cmp_exc_prj_loci, intersection_prj_loci], [inp_prj_curated_loci_coverage, cmp_prj_curated_loci_coverage]] #---------------------------------------------------------------------------------------------------------------------- def write_reactome_files(dict_map, reactome_gene_protein_path, reactome_projection_path, projection_prefix) : #---------------------------------------------------------------------------------------------------------------------- """ accepts a single projection map source and outputs the appropriate reference::projection protein and gene::protein "orthopair" format files, including a projection prefix as needed """ val_set = set() # hold unique protein list REACTOME_GENE_PROTEIN_OUT_FILE = open(reactome_gene_protein_path,'w') REACTOME_PROJECTION_OUT_FILE = open(reactome_projection_path,'w') for k, v_list in sorted(dict_map.iteritems()) : REACTOME_PROJECTION_OUT_FILE.write(k + "\t" + " ".join(map(lambda x: x.replace(x, (projection_prefix if projection_prefix else "") + x), v_list)) + "\n") for v in v_list : if v not in val_set : val_set.add(v) REACTOME_GENE_PROTEIN_OUT_FILE.write(v + "\t" + (projection_prefix if projection_prefix else "") + v + "\n") REACTOME_GENE_PROTEIN_OUT_FILE.close() REACTOME_PROJECTION_OUT_FILE.close() #---------------------------------------------------------------------------------------------------------------------- def generate_venn(venn_data, list_coverage, colors, is_ref, ref_species, proj_species, reciprocal_id, confidence, venn_output_path) : #---------------------------------------------------------------------------------------------------------------------- """build and display Venn diagrams representing reference loci overlap""" plt.figure(figsize=(9, 9 if is_ref else 8)) if is_ref : intersection_loci = len(venn_data[0] & venn_data[1]) union_loci = len(venn_data[0] | venn_data[1]) inp_exc_loci = len(venn_data[0] - venn_data[1]) cmp_exc_loci = len(venn_data[1] - venn_data[0]) v = venn2(venn_data, ('Inparanoid', 'Compara')) else : intersection_loci = venn_data[2] inp_exc_loci = venn_data[0] cmp_exc_loci = venn_data[1] union_loci = inp_exc_loci + intersection_loci + cmp_exc_loci v = venn2(subsets = venn_data) v.get_patch_by_id('10').set_alpha(0.5) v.get_patch_by_id('10').set_color(colors[0]) v.get_patch_by_id('01').set_alpha(0.5) v.get_patch_by_id('01').set_color(colors[1]) v.get_patch_by_id('11').set_alpha(0.75) v.get_patch_by_id('11').set_color(colors[2]) v.get_label_by_id('10').set_text('') v.get_label_by_id('01').set_text('') v.get_label_by_id('11').set_text('') plt.annotate(str(inp_exc_loci), xy = v.get_label_by_id('10').get_position(), xytext = (0,-10), size = 'x-large', ha = 'center', textcoords = 'offset points') plt.annotate(str(cmp_exc_loci), xy = v.get_label_by_id('01').get_position(), xytext = (0,-10), size = 'x-large', ha = 'center', textcoords = 'offset points') plt.annotate(str(intersection_loci), xy = v.get_label_by_id('11').get_position(), xytext = (0,-10), size = 'x-large', ha = 'center', textcoords = 'offset points') v.get_label_by_id('A').set_text('') v.get_label_by_id('B').set_text('') plt.annotate('Inparanoid', xy = v.get_label_by_id('10').get_position(), xytext = (-30,-70), size = 'x-large', ha = 'center', textcoords = 'offset points', bbox = dict(boxstyle = 'round, pad=0.5', fc = colors[3], alpha = 0.3)) plt.annotate('Compara', xy = v.get_label_by_id('01').get_position(), xytext = (30,-70), size = 'x-large', ha = 'center', textcoords = 'offset points', bbox = dict(boxstyle = 'round, pad = 0.5', fc = colors[4], alpha = 0.3)) # show percentage coverage of curated reference loci on reference diagram only if is_ref : plt.annotate('[Coverage of curated reference loci]\nInparanoid: ' + list_coverage[0] + '\nCompara: ' + list_coverage[1], xy = v.get_label_by_id('11').get_position(), xytext = (0,-300), size = 'large', ha = 'center', va = 'bottom', textcoords = 'offset points') plt.title('[' + ref_species + ' > ' + proj_species + ']\n' + (' Reference ' if is_ref else ' Projection ') + 'loci (' + str(union_loci) + ' projected' + (' of ' + str(COUNT_TOTAL_REF_LOCI) + ' total' if is_ref else '') + ') coverage comparison\nbetween Inparanoid super-clusters and Compara orthology data,\ngiven ' + str(reciprocal_id) + '% Compara reciprocal identity' + (', high-confidence only' if confidence else '')) if venn_output_path : plt.savefig(venn_output_path + '/incomparanoid_' + ref_species.replace(' ','_') + '_2_' + proj_species.replace(' ','_') + '_recip_' + str(reciprocal_id) + ('_all_confidence' if confidence else '') + ('_reference_loci' if is_ref else '_projection_loci') + '_comparison.png', dpi=None, facecolor='w', edgecolor='w', orientation=None, papertype=None, format='png', transparent=True, bbox_inches=None, pad_inches=0.1, frameon=None) plt.show() #---------------------------------------------------------------------------------------------------------------------- # main #---------------------------------------------------------------------------------------------------------------------- # process args parser = argparse.ArgumentParser(description='Script with different analysis methods used to compare Ensembl and Inparanoid species projections.') # input settings parser.add_argument('-f', '--filtering_loci_path', help='list of curated reference loci user for filtering') parser.add_argument('-e', '--ensembl_input_path', help='ensembl compara input file') parser.add_argument('-i', '--inparanoid_input_path', help='inparanoid supercluster input file') parser.add_argument('-m', '--rap_map_path', help='MSU-RAP mapping file') parser.add_argument('-r', '--reciprocal_id', type=int, help='reciprocal identity percentage') parser.add_argument('-C', '--confidence_high', help='only use ensembl projections marked as high-confidence', action='store_true') parser.add_argument('-u', '--uniprot_substitution', help='file path to UniProt substitution data for reference loci') parser.add_argument('--ref_species', help='reference species') parser.add_argument('--proj_species', help='projection species') # TODO: add an "inparanoid super-cluster vs. conventional input" flag # output settings parser.add_argument('-p', '--projection_prefix', help='add a platform-specific prefix to the projected protein identifiers', default='') # e.g. 'MaizeGDB:' parser.add_argument('-c', '--comparison_file_path', help='output file containing statistical comparisons') parser.add_argument('-E', '--ensembl_output_path', help='output file containing flat (1-to-many) ensemble ortho pairs') parser.add_argument('-I', '--inparanoid_output_path', help='output file containing flat (1-to-many) inparanoid ortho pairs') parser.add_argument('-g', '--generate_reactome_output', help='produce ortho_pair files required by Reactome projection inference script for specified projection source', choices=['ensembl', 'inparanoid']) parser.add_argument('-R', '--reactome_gene_protein_path', help='output file containing gene::protein mappings for Reactome projection inference') # e.g. 'zmay_gene_protein_mapping.txt' parser.add_argument('-P', '--reactome_projection_path', help='output file containing reference::projection protein mappings for Reactome inference') # e.g. 'osat_zmay_mapping.txt' parser.add_argument('-V', '--venn_diagram', help='generate Venn diagram', action='store_true') parser.add_argument('-v', '--venn_output_path', help='save Venn diagrams to this directory') args = parser.parse_args() #print args # create ref loci::UniProt map, if specified if args.uniprot_substitution : dict_uniprot_map = create_dict_uniprot_map(args.uniprot_substitution) # create projection maps if (args.inparanoid_input_path) : dict_inp_map = create_inp_map(args.inparanoid_input_path, dict_uniprot_map) if (args.ensembl_input_path) : dict_ens_map = create_ens_map(args.filtering_loci_path, args.ensembl_input_path, args.rap_map_path, args.reciprocal_id, dict_uniprot_map, 1 if args.confidence_high else 0) # generate stats and output them; assumes both inparanoid and ensembl data have been provided if (args.comparison_file_path) : all_venn_data = compare_maps(dict_ens_map, dict_inp_map, args.comparison_file_path, args.ensembl_output_path, args.inparanoid_output_path) if args.generate_reactome_output == 'ensembl' : write_reactome_files(dict_ens_map, args.reactome_gene_protein_path, args.reactome_projection_path, args.projection_prefix) if args.generate_reactome_output == 'inparanoid' : write_reactome_files(dict_inp_map, args.reactome_gene_protein_path, args.reactome_projection_path, args.projection_prefix) # NOTE: requires local matplotlib backend configuration if args.venn_diagram : generate_venn(all_venn_data[0], all_venn_data[2], ['red', 'yellow', 'orange', 'blue', 'lime'], 1, args.ref_species, args.proj_species, args.reciprocal_id, 1 if args.confidence_high else 0, args.venn_output_path) generate_venn(all_venn_data[1], all_venn_data[2], ['green', 'yellow', 'lightgreen', 'blue', 'lime'], 0, args.ref_species, args.proj_species, args.reciprocal_id, 1 if args.confidence_high else 0, args.venn_output_path) #---------------------------------------------------------------------------------------------------------------------- # end #----------------------------------------------------------------------------------------------------------------------