''' Contig to Go ID'''   #parameters: 'organism_to_symbol' file containing organisms and all of their related gene symbols found in #the reference gene file from Todd Castoe, 'geneNames.txt' reference file from Todd Castoe # 'organisms' - list of organism names of .gaf files   file1 = open('organism_to_symbol') org_file = file1.readlines() file1.close()   file2 = open('geneNames.txt') contig_file = file2.readlines() file2.close()   # organisms found in GAF file (any desired additional organisms should be # added here) organisms = ['Homo_sapiens', 'Mus_musculus', 'Bos_taurus', 'Rattus_norvegicus',              'Danio_rerio', 'Arabidopsis_thaliana', 'Gallus_gallus',              'Canis_familiaris', 'Drosophila_melanogaster', 'Pongo_abelii',              'Xenopus_laevis', 'unknown_function']   '''Makes dictionary where keys are gene symbols and values are lists of        GO ID #s     ''' def make_dict(org_name):         filename = org_name+'_gaf.txt'     file1 = open(filename)     gaf = file1.readlines() # gaf is a list of all of the lines of the gaf file     file1.close()         gaf_dict = {}         for entry in gaf:         #values is a list of all of the elements         #in a single line of the gaf file                 values = entry.split('\t')         if len(values) >1:                         #if condition for a new entry, generates a new list             if values[2].upper().strip() in gaf_dict:                                 #values[2] is gene name                 #values[4] is GO ID                 gaf_dict[values[2].upper().strip()].\                     append(values[4].upper().strip())                                 #else condition for existing gene entry, appends GO ID to list                    else:                 gaf_dict[values[2].upper().strip()] = \                     [values[4].upper().strip()]                       return gaf_dict   #Replaces unwanted symbols in each line def clean_line(line):     if "''" in line:         line = line.replace("''", "^")     if "'" in line:         line = line.replace("'", "_prime")     if '"' in line:         line = line.replace('"', '')     return line       output = open('organisms_to_GO_ID.txt', 'w') output2 = open('contig_to_GO_ID.txt', 'w')   #writing the organism to GO ID file for organism in organisms:     gaf_dict = make_dict(organism)     for line in org_file:         if organism in line:             cutsite = line.find('\t')             gene_sym = line[cutsite:].strip()              if gene_sym in gaf_dict:                 go_ids = ''                 for go_id in gaf_dict[gene_sym]:                     go_ids+=go_id+'\t'                 output.write(organism+'\t'+gene_sym+'\t'+go_ids+'\n')        if organism == 'unknown_function':         for gene_sym in gaf_dict:             go_ids = ''             for go_id in gaf_dict[gene_sym]:                 go_ids+=go_id+'\t'             output.write(organism+'\t'+gene_sym+'\t'+go_ids+'\n')                output2.write('>'+clean_line(gene_sym)+'\t'+', '.join(gaf_dict[gene_sym])+'\n')                 # making contig to GO ID file     for entry in contig_file:         if organism in entry:                                             #find gene symbol             cutsite = entry.find('_')             cutsite2 = entry[cutsite+1:].find('_')             gene_symbol = entry[cutsite+1:cutsite+1+cutsite2].upper().strip()               if gene_symbol in gaf_dict:                 contig_name = clean_line(entry.strip())                   output2.write(contig_name+'\t')                 go_id_list = ', '.join(gaf_dict[gene_symbol])                 #for go_id in gaf_dict[gene_symbol]:                    #output2.write(go_id+', ')                 output2.write(go_id_list)                 output2.write('\n')     output.close() output2.close()                                                             _____________________________________________________________________________  ''' Script to pull out organism names from contig file and provide count of each 02/1/16 '''   file1 = open("geneNames.txt") contents = file1.readlines() file1.close()   output = open("GO_organisms.txt", "w")   organisms = {}   for line in contents:     if "Co" in line:         i = -1         k = 0         while k < 2:             if line[i] == "_":                 k = k + 1             i = i - 1         organism = line[i+2:]                 if organism in organisms:             organisms[organism]+=1         else:             organisms[organism] = 1   for organism in organisms:     #print organism[:-1]     output.write(organism[:-1]+'\t'+str(organisms[organism])+'\n')     output.close()   _____________________________________________________________________________   '''Script to generate output file containing contigs, frequency and fpkm values     -can be modified for both fed and unfed'''   file1 = open("intestine_no_1.genes.results") contents1 = file1.readlines() file1.close()   file2 = open("intestine_no_2.genes.results") contents2 = file2.readlines() file2.close()   file3 = open("intestine_no_3.genes.results") contents3 = file3.readlines() file3.close()   file1fed = open("intestine_fed_4.genes.results") contents1fed = file1fed.readlines() file1fed.close()   file2fed = open("intestine_fed_5.genes.results") contents2fed = file2fed.readlines() file2fed.close()   file3fed = open("intestine_fed_6.genes.results") contents3fed = file3fed.readlines() file3fed.close()     #output = open("contig_fpkm_freq.txt", "w")   output_fed = open("conting_fpkm_freq_fed.txt", "w")     dict1 = {} contentList = [contents1, contents2, contents3]   contentFed = [contents1fed, contents2fed, contents3fed]   for content in contentFed:       for line in content:         if 'FPKM' not in line:             values = line.strip().split('\t')             #print values             if values[0] not in dict1:                 dict1[values[0]] = [float(values[6]), 1]             else:                 fpkm = float(dict1[values[0]][0])                 frequency = float(dict1[values[0]][1])                 frequencyNew = frequency + 1                 fpkmNew = fpkm + float(values[6])                 dict1[values[0]] = [fpkmNew, frequencyNew]   print dict1 for thing in dict1:     output_fed.write(thing + '\t' + str(dict1[thing][0]/dict1[thing][1]) + '\t' + str(dict1[thing][1]) + '\n')     #output.close() output_fed.close() _____________________________________________________________________________                                                              """ Assigns relative abundance scores to GO IDs Authors: Dylan Maghini, Dustin Atchley, Nick Balanda Date: April 19, 2016   This script calculates abundance scores for each GO ID linked to genes in the six '.results' files. For each gene associated with a particular GO term, the change in expression of that gene across treatments (calculated as average of fed expression - average of unfed expression) is added to the score for the particular GO term. The script also tracks how many genes are associated with a particular GO term. """ import csv   # contig \t GO id list file1 = open('../contig_to_GO_ID.txt') genes = file1.readlines() file1.close()   # opens and reads csv, turns it into list of lines (scores) scores = [] with open('../gene_to_score.csv', 'rb') as csvfile:     scores = [row for row in csv.reader(csvfile.read().splitlines())]   gene_to_GO = {} #keys are contig names, values are lists of GO IDs GO_IDs = {} #keys are GO IDs, values are blank   for line in genes:     values = line.strip().split('\t')     gene = values[0][1:]     ids = values[1].strip().split(', ')     gene_to_GO[gene] = ids         for goid in ids:         if goid not in GO_IDs:             GO_IDs[goid] = [0, 0] #[score difference, # of genes w/ that GO ID] i = 0 for item in scores:     if i > 0:         gene = item[0].strip()         difference = float(item[1])         if gene in gene_to_GO:             gene_GOs = gene_to_GO[gene]             for go in gene_GOs: #for each go ID associated with that gene                 GO_IDs[go][0]+=difference #add the difference score                 GO_IDs[go][1]+=1  #increment number of genes that GO ID is in                   i+=1   output = open('../GO_abundance.txt', 'w') for go in GO_IDs:     output.write(go+'\t'+str(GO_IDs[go][0])+'\t'+str(GO_IDs[go][1])+'\n')                                                             _____________________________________________________________________________ ''' Script to remove duplicate names from gene name file. Script runs through the geneNames text file and adds numbers to repeat contig names. Contigs don't have numbers appended to them if they are unique. Opens geneNames.txt Writes geneNamesMod.txt (numbered duplicates) Writes duplicatesList.txt (contig and number of times it appears)   Dylan Maghini, Dustin Atchley January 28, 2016 '''     file1 = open("geneNames.txt") contents = file1.readlines() file1.close()   output = open("geneNamesMod.txt", "w") output2 = open("duplicatesList.txt", "w")   duplicate_dict = {} for line in contents:     if "Co" in line:         cutsite = line.find('_')         name = line[cutsite+1:]         if name in duplicate_dict:             duplicate_dict[name].append(duplicate_dict[name][-1]+1)         else:             duplicate_dict[name]=[1]   for item in duplicate_dict:     output2.write(item[:-1] + ':\t' + str(len(duplicate_dict[item]))+'\n') output2.close()   for line in contents:     if "Co" in line:         cutsite = line.find('_')         name = line[cutsite+1:]         if duplicate_dict[name][-1] == 1:             output.write(line)         else:             line = line[:-1]+'_'+str(duplicate_dict[name][0])+'\n'             output.write(line)             duplicate_dict[name].pop(0)     else:         output.write(line)       for item in duplicate_dict:     if duplicate_dict[item]>1:         print item, duplicate_dict[item]         output.close()