''' Contig to Go ID'''
 
#parameters: 'organism_to_symbol' file containing organisms and all of their related gene symbols found in
#the reference gene file from Todd Castoe, 'geneNames.txt' reference file from Todd Castoe
# 'organisms' - list of organism names of .gaf files
 
file1 = open('organism_to_symbol')
org_file = file1.readlines()
file1.close()
 
file2 = open('geneNames.txt')
contig_file = file2.readlines()
file2.close()
 
# organisms found in GAF file (any desired additional organisms should be
# added here)
organisms = ['Homo_sapiens', 'Mus_musculus', 'Bos_taurus', 'Rattus_norvegicus',
             'Danio_rerio', 'Arabidopsis_thaliana', 'Gallus_gallus',
             'Canis_familiaris', 'Drosophila_melanogaster', 'Pongo_abelii',
             'Xenopus_laevis', 'unknown_function']

 
'''Makes dictionary where keys are gene symbols and values are lists of
       GO ID #s
    '''
def make_dict(org_name):
   
    filename = org_name+'_gaf.txt'
    file1 = open(filename)
    gaf = file1.readlines() # gaf is a list of all of the lines of the gaf file
    file1.close()
   
    gaf_dict = {}
   
    for entry in gaf:
        #values is a list of all of the elements
        #in a single line of the gaf file
       
        values = entry.split('\t')
        if len(values) >1:
           
            #if condition for a new entry, generates a new list
            if values[2].upper().strip() in gaf_dict:
               
                #values[2] is gene name
                #values[4] is GO ID
                gaf_dict[values[2].upper().strip()].\
                    append(values[4].upper().strip())
                   
            #else condition for existing gene entry, appends GO ID to list       
            else:
                gaf_dict[values[2].upper().strip()] = \
                    [values[4].upper().strip()]  
               
    return gaf_dict
 
#Replaces unwanted symbols in each line
def clean_line(line):
    if "''" in line:
        line = line.replace("''", "^")
    if "'" in line:
        line = line.replace("'", "_prime")
    if '"' in line:
        line = line.replace('"', '')
    return line
   
 
output = open('organisms_to_GO_ID.txt', 'w')
output2 = open('contig_to_GO_ID.txt', 'w')
 
#writing the organism to GO ID file
for organism in organisms:
    gaf_dict = make_dict(organism)
    for line in org_file:
        if organism in line:
            cutsite = line.find('\t')
            gene_sym = line[cutsite:].strip() 
            if gene_sym in gaf_dict:
                go_ids = ''
                for go_id in gaf_dict[gene_sym]:
                    go_ids+=go_id+'\t'
                output.write(organism+'\t'+gene_sym+'\t'+go_ids+'\n')
  
    if organism == 'unknown_function':
        for gene_sym in gaf_dict:
            go_ids = ''
            for go_id in gaf_dict[gene_sym]:
                go_ids+=go_id+'\t'
            output.write(organism+'\t'+gene_sym+'\t'+go_ids+'\n')   
            output2.write('>'+clean_line(gene_sym)+'\t'+', '.join(gaf_dict[gene_sym])+'\n')
           
    # making contig to GO ID file
    for entry in contig_file:
        if organism in entry:
           
                   
            #find gene symbol
            cutsite = entry.find('_')
            cutsite2 = entry[cutsite+1:].find('_')
            gene_symbol = entry[cutsite+1:cutsite+1+cutsite2].upper().strip()
 
            if gene_symbol in gaf_dict:
                contig_name = clean_line(entry.strip())
 
                output2.write(contig_name+'\t')
                go_id_list = ', '.join(gaf_dict[gene_symbol])
                #for go_id in gaf_dict[gene_symbol]:
                   #output2.write(go_id+', ')
                output2.write(go_id_list)
                output2.write('\n')
 
 
output.close()
output2.close()
                                                           
_____________________________________________________________________________ 
''' Script to pull out organism names from contig file and provide count of each
02/1/16
'''
 
file1 = open("geneNames.txt")
contents = file1.readlines()
file1.close()
 
output = open("GO_organisms.txt", "w")
 
organisms = {}
 
for line in contents:
    if "Co" in line:
        i = -1
        k = 0
        while k < 2:
            if line[i] == "_":
                k = k + 1
            i = i - 1
        organism = line[i+2:]
       
        if organism in organisms:
            organisms[organism]+=1
        else:
            organisms[organism] = 1
 
for organism in organisms:
    #print organism[:-1]
    output.write(organism[:-1]+'\t'+str(organisms[organism])+'\n')
   
output.close()
 
_____________________________________________________________________________
 
'''Script to generate output file containing contigs, frequency and fpkm values
    -can be modified for both fed and unfed'''
 
file1 = open("intestine_no_1.genes.results")
contents1 = file1.readlines()
file1.close()
 
file2 = open("intestine_no_2.genes.results")
contents2 = file2.readlines()
file2.close()
 
file3 = open("intestine_no_3.genes.results")
contents3 = file3.readlines()
file3.close()
 
file1fed = open("intestine_fed_4.genes.results")
contents1fed = file1fed.readlines()
file1fed.close()
 
file2fed = open("intestine_fed_5.genes.results")
contents2fed = file2fed.readlines()
file2fed.close()
 
file3fed = open("intestine_fed_6.genes.results")
contents3fed = file3fed.readlines()
file3fed.close()
 
 
#output = open("contig_fpkm_freq.txt", "w")
 
output_fed = open("conting_fpkm_freq_fed.txt", "w")
 
 
dict1 = {}
contentList = [contents1, contents2, contents3]
 
contentFed = [contents1fed, contents2fed, contents3fed]
 
for content in contentFed:
 
    for line in content:
        if 'FPKM' not in line:
            values = line.strip().split('\t')
            #print values
            if values[0] not in dict1:
                dict1[values[0]] = [float(values[6]), 1]
            else:
                fpkm = float(dict1[values[0]][0])
                frequency = float(dict1[values[0]][1])
                frequencyNew = frequency + 1
                fpkmNew = fpkm + float(values[6])
                dict1[values[0]] = [fpkmNew, frequencyNew]
 
print dict1
for thing in dict1:
    output_fed.write(thing + '\t' + str(dict1[thing][0]/dict1[thing][1]) + '\t' + str(dict1[thing][1]) + '\n')
   
#output.close()
output_fed.close()
_____________________________________________________________________________                                                           
 
""" Assigns relative abundance scores to GO IDs
Authors: Dylan Maghini, Dustin Atchley, Nick Balanda
Date: April 19, 2016
 
This script calculates abundance scores for each GO ID linked to genes in the
six '.results' files. For each gene associated with a particular GO term, the
change in expression of that gene across treatments (calculated as
average of fed expression - average of unfed expression) is added to the score
for the particular GO term. The script also tracks how many genes are
associated with a particular GO term.
"""
import csv
 
# contig \t GO id list
file1 = open('../contig_to_GO_ID.txt')
genes = file1.readlines()
file1.close()
 
# opens and reads csv, turns it into list of lines (scores)
scores = []
with open('../gene_to_score.csv', 'rb') as csvfile:
    scores = [row for row in csv.reader(csvfile.read().splitlines())]
 
gene_to_GO = {} #keys are contig names, values are lists of GO IDs
GO_IDs = {} #keys are GO IDs, values are blank
 
for line in genes:
    values = line.strip().split('\t')
    gene = values[0][1:]
    ids = values[1].strip().split(', ')
    gene_to_GO[gene] = ids
   
    for goid in ids:
        if goid not in GO_IDs:
            GO_IDs[goid] = [0, 0] #[score difference, # of genes w/ that GO ID]
i = 0
for item in scores:
    if i > 0:
        gene = item[0].strip()
        difference = float(item[1])
        if gene in gene_to_GO:
            gene_GOs = gene_to_GO[gene]
            for go in gene_GOs: #for each go ID associated with that gene
                GO_IDs[go][0]+=difference #add the difference score
                GO_IDs[go][1]+=1  #increment number of genes that GO ID is in
 
           
    i+=1
 
output = open('../GO_abundance.txt', 'w')
for go in GO_IDs:
    output.write(go+'\t'+str(GO_IDs[go][0])+'\t'+str(GO_IDs[go][1])+'\n')
                                                           

_____________________________________________________________________________
'''
Script to remove duplicate names from gene name file. Script runs through
the geneNames text file and adds numbers to repeat contig names. Contigs don't
have numbers appended to them if they are unique.
Opens geneNames.txt
Writes geneNamesMod.txt (numbered duplicates)
Writes duplicatesList.txt (contig and number of times it appears)
 
Dylan Maghini, Dustin Atchley
January 28, 2016
'''
 
 
file1 = open("geneNames.txt")
contents = file1.readlines()
file1.close()
 
output = open("geneNamesMod.txt", "w")
output2 = open("duplicatesList.txt", "w")
 
duplicate_dict = {}
for line in contents:
    if "Co" in line:
        cutsite = line.find('_')
        name = line[cutsite+1:]
        if name in duplicate_dict:
            duplicate_dict[name].append(duplicate_dict[name][-1]+1)
        else:
            duplicate_dict[name]=[1]
 
for item in duplicate_dict:
    output2.write(item[:-1] + ':\t' + str(len(duplicate_dict[item]))+'\n')
output2.close()
 
for line in contents:
    if "Co" in line:
        cutsite = line.find('_')
        name = line[cutsite+1:]
        if duplicate_dict[name][-1] == 1:
            output.write(line)
        else:
            line = line[:-1]+'_'+str(duplicate_dict[name][0])+'\n'
            output.write(line)
            duplicate_dict[name].pop(0)
    else:
        output.write(line)
 
 
 
for item in duplicate_dict:
    if duplicate_dict[item]>1:
        print item, duplicate_dict[item]
       
output.close()