# -*- coding: utf-8 -*- """ PLANTALIGDB AUTOMATIC WORKFLOW 09-10-2018 """ from Bio import Entrez from Bio import SeqIO from Bio import SeqUtils from Bio import AlignIO from Bio.Align import AlignInfo from Bio.Align import MultipleSeqAlignment from Bio.SeqRecord import SeqRecord from Bio.Seq import Seq from Bio.Alphabet import generic_dna from Bio.Alphabet import IUPAC, Gapped #from cogent.core.profile import Profile #from cogent import LoadSeqs, RNA from os import listdir import time import subprocess import os from amas import AMAS import sqlite3 import itertools PlantAligDB_alignments={} class PlantAligDB_workflow: def _init_(self): pass def search_PlantAligDB_families_Genbank(self,region_to_search): print ("Global module to read data from refseq genbank PlantAligDB families and regions") Entrez.email = "example@gmail.com" # Always tell NCBI who you are #Extract mtDNA record information terms_to_search=listdir("alignments/"+region_to_search) #terms_to_search2=listdir("alignments/psbA-trnH/") #terms_to_search3=listdir("alignments/ITS/") file_data=terms_to_search for data in file_data: file_data_split=data.split(".") region_family=file_data_split[0] region_family=region_family.split("_") region=region_family[0] family=region_family[1] print (region) print (family) dir_files="workflow_script_alignments/"+region+"/" genbank_files=listdir(dir_files) files_saved=region+"_"+family+".fas" if files_saved not in genbank_files: region_to_search2=region_to_search.split("-") region_to_search2=region_to_search.split("_") if len(region_to_search2)==2: term_search =family+"[ORGN] and "+ region_to_search2[0] +" and "+region_to_search2[1] if len(region_to_search2)==1: term_search =family+"[ORGN] and "+ region_to_search2[0] print (term_search) handle = Entrez.esearch(db="nuccore", retmax=400, term=term_search) record_search=Entrez.read(handle,validate=False) record_ids=record_search['IdList'] time.sleep(0.5) for id in record_ids: handle = Entrez.efetch(db="nuccore", id=str(id), retmode="xml") record=Entrez.read(handle,validate=False) #print (record[0].keys()) time.sleep(0.5) if "GBSeq_sequence" in record[0] and len(record[0]["GBSeq_sequence"])<10000: record_sequence = str(record[0]["GBSeq_sequence"]) record_organism = record[0]["GBSeq_organism"] record_taxonomy = record[0]["GBSeq_taxonomy"] record_features = record[0]["GBSeq_feature-table"] record_locus=record[0]["GBSeq_definition"] record_paccession=record[0]["GBSeq_locus"] record_strandedness=record[0]["GBSeq_strandedness"] if family in record_taxonomy: record_taxonomy=family if record_organism not in PlantAligDB_alignments: print ("Reading data for: "+record_organism) PlantAligDB_alignments[record_organism]=[("Sequence:",record_sequence),("Organism:",record_organism), ("Taxonomy:",record_taxonomy),("Locus:",record_locus),("Accession:",record_paccession),("Strandedness",record_strandedness)] #Read features qualifiers (genes, tRNA, and rRNA) for feature in record_features: feature_keys=feature.keys() if 'GBFeature_quals' in feature_keys: feature_quals=feature['GBFeature_quals'] feature_intervals=feature['GBFeature_intervals'] feature_value=feature_quals[0]['GBQualifier_value'] feature_intervals_keys=feature_intervals[0].keys() #print (feature_quals) #print (feature_intervals) #print (feature_value) #print (feature_intervals) if 'GBInterval_from' in feature_intervals_keys: feature_positions=(feature_intervals[0]['GBInterval_from'],feature_intervals[0]['GBInterval_to']) if (feature_value,feature_positions[0],feature_positions[1]) not in PlantAligDB_alignments[record_organism]: PlantAligDB_alignments[record_organism].append((feature_value,feature_positions[0],feature_positions[1])) save_file=open("workflow_script_alignments/"+region+"/"+region+"_"+family+".fas",'w+') for data in PlantAligDB_alignments: print ("Saving features file for family and region:"+data) sequence=PlantAligDB_alignments[data][0][1] organism=PlantAligDB_alignments[data][1][1].replace(" ","_") taxonomy=PlantAligDB_alignments[data][2][1].replace(" ","") features_data=PlantAligDB_alignments[data][3][1].replace(" ","_") features_extra=PlantAligDB_alignments[data][4][1].replace(" ","_") paccession=PlantAligDB_alignments[data][5][1] strandedness=PlantAligDB_alignments[data][6][1] save_file.write(">"+str(organism)+"-"+features_extra+"-"+str(taxonomy)+"-"+str(features_data)+"-"+str(paccession)+"-"+str(strandedness)+"\n") save_file.write(str(sequence)+"\n") PlantAligDB_alignments.clear() save_file.close() def run_mafft_initial_alignment(self,region): dir_files="workflow_script_alignments/"+region+"/" genbank_files=listdir(dir_files) root=os.getcwd() os.chdir(root+'\\workflow_script_alignments\\'+region+"\\") for file_name in genbank_files: if file_name[len(file_name)-3:len(file_name)]=="fas": file_data_split=file_name.split(".") region_family=file_data_split[0] region_family=region_family.split("_") #region=region_family[0] family=region_family[1].replace("\\r\\n","") file_to_save="aligned/"+region+"_"+family+"_aligned.fas" print ("Aligning sequences: "+file_name,file_to_save) PlantAligDB_workflow.run_mafft_alignment_fasta(file_name,file_to_save) os.chdir(root) def run_mafft_alignment(self,file_in, file_out): root=os.getcwd() #os.chdir(root+'\\workflow_script_alignments\\atpF-atpH') command_line='mafft.bat --auto --adjustdirectionaccurately --clustalout '+file_in+' > '+file_out child_process = subprocess.Popen(str(command_line),stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) alignment_data, err = child_process.communicate() print (alignment_data, err) status = child_process.wait() print (status) os.chdir(root) def run_mafft_alignment_fasta(self,file_in, file_out): root=os.getcwd() #os.chdir(root+'\\workflow_script_alignments\\atpF-atpH') command_line='mafft.bat --auto --adjustdirectionaccurately '+file_in+' > '+file_out child_process = subprocess.Popen(str(command_line),stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) alignment_data, err = child_process.communicate() print (alignment_data, err) status = child_process.wait() print (status) os.chdir(root) def remove_sequences_with_large_deletions_or_N_regions(self,region): dir_files="workflow_script_alignments/"+region+"/"+"aligned" genbank_files=listdir(dir_files) root=os.getcwd() os.chdir(root+'\\workflow_script_alignments\\'+region+"\\aligned") for file_in in genbank_files: print (file_in) if file_in[len(file_in)-3:len(file_in)]=="fas": align = AlignIO.read(file_in,"fasta") align_filtered=MultipleSeqAlignment([]) alignment_len=align.get_alignment_length() alignment_seq_number=len(align) #file_out=open("step2/"+file_in[0:len(file_in)-4]+"_step2.fas","w") if alignment_seq_number>=10: for seq_record in align: sequence_to_filter=seq_record.seq seq_name_to_filter=seq_record.name N_count=sequence_to_filter.count("N") n_count=sequence_to_filter.count("n") DEL_count=sequence_to_filter.count("-") #print ("N:"+str(N_count),"n:"+str(n_count),"Del:"+str(DEL_count)) #print ("Number of N:"+str(N_count)) #print ("Number of -:"+str(DEL_count)) #print (list(align)) if N_count>alignment_len*0.50: pass print (seq_name_to_filter+" removed from alignment") elif DEL_count>alignment_len*0.50: pass print (seq_name_to_filter+" removed from alignment") elif n_count>alignment_len*0.50: pass print (seq_name_to_filter+" removed from alignment") else: seq_to_add=SeqRecord(Seq(str(sequence_to_filter), generic_dna), id=str(seq_name_to_filter)) #print (seq_to_add) align_filtered.append(seq_to_add) #print (seq_record.seq) #print (seq_record.name) print (align_filtered) align_filtered_seq_number=len(align_filtered) if align_filtered_seq_number>10: print ("Initial alignment seqs: "+str(len(align))) print ("Filtered alignment seqs: "+str(align_filtered_seq_number)) align = AlignIO.write(align_filtered,"step2/"+file_in[0:len(file_in)-4]+"_step2.fas", "fasta") file_out_aligned="step3/"+file_in[0:len(file_in)-4]+"_step3.fas" PlantAligDB_workflow.run_mafft_alignment_fasta("step2/"+file_in[0:len(file_in)-4]+"_step2.fas",file_out_aligned) def calculate_pairwise_for_regions(self, region): dir_files="workflow_script_alignments/"+region+"/"+"aligned/step3" genbank_files=listdir(dir_files) root=os.getcwd() os.chdir(root+'\\workflow_script_alignments\\'+region+"\\aligned\\step3") for file_in in genbank_files: print (file_in) if file_in[len(file_in)-3:len(file_in)]=="fas": align = AlignIO.read(file_in,"fasta") align_seq_number=len(align) if align_seq_number>0: PlantAligDB_workflow.calculate_pairwise_identity(file_in) def calculate_pairwise_identity(self,file_in): align = AlignIO.read(file_in, "fasta") alignment_len=align.get_alignment_length() alignment_seq_number=len(align) number_identical_sites=0 number_pairwise_equal=0 total_pairwise=0 for i in range(alignment_len): site=align[:,i:i+1] #print (site) summary_align = AlignInfo.SummaryInfo(site) number_of_NT=summary_align.pos_specific_score_matrix() #print (alignment_seq_number) if "-" in number_of_NT[0]: if number_of_NT[0]["-"]>=alignment_seq_number*1: del number_of_NT[0]["-"] #print (number_of_NT[0]) if len(number_of_NT[0])==1: number_identical_sites+=1 if len(number_of_NT[0])>1: for nucleotide in number_of_NT[0]: number_nucleotide=int(number_of_NT[0][nucleotide]) equal_NT=range(number_nucleotide) equal_combinations=list(itertools.combinations(equal_NT,2)) #print ("Number equal combinations:"+(str(equal_combinations))) number_pairwise_equal+=len(equal_combinations) total_number_pairwise=list(itertools.combinations(range(alignment_seq_number),2)) total_pairwise+=len(total_number_pairwise) pairwise_differences=total_pairwise-number_pairwise_equal percentage_identical_sites=str(number_identical_sites/alignment_len) percentage_pairwise_identity=str(number_pairwise_equal/total_pairwise) print ("\nTotal number pairwise :"+str(total_pairwise)+"/Total number equal:"+str(number_pairwise_equal)) print ("Percentage of pairwise identity:"+str(number_pairwise_equal/total_pairwise)) print ("Number of identical sites:"+str(number_identical_sites)) percentage_identical_sites=number_identical_sites/alignment_len print ("Percentage of identical sites:"+str(percentage_identical_sites)+"\n\n") return percentage_identical_sites, percentage_pairwise_identity def get_alignment_statistics_summary(self, input_file): meta_aln = AMAS.MetaAlignment(in_files=[input_file], data_type="dna",in_format="fasta", cores=1) summaries = meta_aln.get_summaries() data=meta_aln.get_taxon_summaries() print (data) print (summaries) return summaries def update_PlantAligDB_sqlite(self,region): files_to_update=listdir("workflow_script_alignments/"+region+"/"+"aligned/step3") root=os.getcwd() os.chdir(root+'\\workflow_script_alignments\\'+region+"\\aligned\\step3") PlantAligDB_tax_conn = sqlite3.connect('PlantAligDB_test.sqlite') PlantAligDB_tax_conn.execute('SELECT * FROM PlantAligDB_taxonomicgroups') #regions=["atpF-atpH","ITS","matK","psbA-trnH","rbcL","trnL-CD","trnL-GH"] file_data=files_to_update for data in file_data: file_data_split=data.split(".") region_family=file_data_split[0] region_family=region_family.split("_") region=region_family[0] family=region_family[1] if data[len(data)-3:len(data)]=="fas": align = AlignIO.read(data,"fasta") align_seq_number=len(align) if align_seq_number>0: PIS,PPI=PlantAligDB_workflow.calculate_pairwise_identity(data) print(align) print (region) print (family) #update_data = "UPDATE PlantAligDB_PIS SET `"+ region + "`='test_update_data' WHERE `Family` = '"+family+"'" update_data_tax = "UPDATE PlantAligDB_taxonomicgroups SET `"+ region + "`='"+str(align_seq_number)+"' WHERE `Family` = '"+family+"'" update_data_PIS = "UPDATE PlantAligDB_PIS SET `"+ region + "`='"+str(PIS)+"' WHERE `Family` = '"+family+"'" update_data_PPI = "UPDATE PlantAligDB_PIS SET `"+ region + "`='"+str(PPI)+"' WHERE `Family` = '"+family+"'" print (update_data_tax) PlantAligDB_tax_conn.execute(update_data_tax) PlantAligDB_tax_conn.commit() PlantAligDB_tax_conn.execute('SELECT * FROM PlantAligDB_PIS') PlantAligDB_tax_conn.execute(update_data_PIS) PlantAligDB_tax_conn.execute('SELECT * FROM PlantAligDB_PPI') PlantAligDB_tax_conn.execute(update_data_PPI) PlantAligDB_tax_conn.commit() #Running example for "atpF-atpH" Step1(get sequences), Step2 (alignment and filter), Step3 (Final alignment) PlantAligDB_workflow=PlantAligDB_workflow() #PlantAligDB_workflow.search_PlantAligDB_families_Genbank("atpF-atpH") #PlantAligDB_workflow.run_mafft_initial_alignment("atpF-atpH") #PlantAligDB_workflow.remove_sequences_with_large_deletions_or_N_regions("atpF-atpH") #PlantAligDB_workflow.calculate_pairwise_for_regions("atpF-atpH") #PlantAligDB_workflow.update_PlantAligDB_sqlite("atpF-atpH")