'''Nick Elder
3/17/2016
Lab Methods in Genomics

This script removes duplicate GO IDs from contigs and then outputs contigs
and GO IDs to a new file. The initial file can be generated by copying and
pasting into a plain text document from an Excel file.
A second file is created of all GO IDs, each listed on its own line. This
can be pasted into an Excel file for further analysis.

The first file requested is the only one that you have to create prior to
running the script. The second and third files are written by the script,
but the user has to provide a name. The script will write over any document
containing the same name.

Input format: one contig per line, as "<contig name><TAB><GO IDs>", where
the GO IDs are separated by ", " and a contig without GO IDs has "X" in the
GO column. Blank lines are ignored.
'''

import sys

# Placeholder found in the GO column of contigs that have no GO IDs.
NO_GO = 'X'

# Python 2 needs the 'U' flag to cope with the old-Mac "\r" line endings that
# Excel copy/paste can produce. Python 3 text mode already translates every
# newline style, and Python 3.11 rejects the 'U' flag outright.
_READ_MODE = 'r' if sys.version_info[0] >= 3 else 'rU'

try:
    _prompt = raw_input  # Python 2
except NameError:
    _prompt = input  # Python 3


def dedupe_go_terms(terms):
    '''Return the GO IDs in terms with repeats removed.

    terms is the text of one GO column: GO IDs separated by ", ". The IDs
    are returned as a list in first-seen order. The "X" placeholder is
    treated like any other term, so 'X' comes back as ['X'].
    '''
    unique = []
    seen = set()  # set membership keeps this linear in the number of IDs
    for go in terms.split(', '):
        if go not in seen:
            seen.add(go)
            unique.append(go)
    return unique


def parse_annotations(lines):
    '''Turn input lines into a list of (contig, unique GO IDs) tuples.

    lines is any iterable of text lines (an open file works). Each line is
    stripped and split on tabs; the first column is the contig name and the
    second is the GO column. Any further columns are ignored. Whitespace
    around the GO column is removed before it is split into IDs.

    Blank lines are skipped. A non-blank line whose GO column is missing or
    empty raises ValueError naming the offending line number.
    '''
    records = []
    for line_number, line in enumerate(lines, 1):
        fields = line.strip().split('\t')
        if fields == ['']:
            # Pasted files commonly end with an empty line.
            continue
        go_field = fields[1].strip() if len(fields) > 1 else ''
        if not go_field:
            raise ValueError(
                'line {0}: expected "<contig><TAB><GO IDs>", got {1!r}'.format(
                    line_number, line))
        records.append((fields[0], dedupe_go_terms(go_field)))
    return records


def collect_go_ids(records):
    '''Return every GO ID from records as one flat list, in file order.

    records is the list produced by parse_annotations. Contigs whose only
    entry is the "X" placeholder contribute nothing. An ID shared by several
    contigs appears once per contig so that it can be counted later.
    '''
    all_ids = []
    for _contig, go_ids in records:
        if go_ids != [NO_GO]:
            all_ids.extend(go_ids)
    return all_ids


def main():
    '''Prompt for the three file names, then write both output files.'''
    # Enter input and output file names
    filename = _prompt('Enter file name (ending with .txt) that contains gene names and GO IDs: ')
    filename2 = _prompt('Enter desired file output name (ending with .txt) that will contain gene names and non-repeating GO IDs: ')
    filename3 = _prompt('Enter desired file output name (ending with .txt) that will contain all listed GO IDs: ')

    # Read the contigs and their GO IDs, removing redundant IDs per contig.
    with open(filename, _READ_MODE) as badfile:
        records = parse_annotations(badfile)

    # Recombine the contigs and IDs and write them to a new file.
    with open(filename2, 'w') as newfile:
        for contig, go_ids in records:
            newfile.write(contig + '\t' + ', '.join(go_ids) + '\n')

    # Put every GO ID on its own line so that it is easily copied and pasted
    # into an Excel document for counting and sorting. The IDs come straight
    # from memory; there is no need to re-read the file written above.
    with open(filename3, 'w') as newfile:
        for go_id in collect_go_ids(records):
            newfile.write(go_id + '\n')


if __name__ == '__main__':
    main()