Methods – Matt McGuffie – Bioinformatics 2019

Methods

     I first constructed a manually-curated list of over 1,200 features commonly found in engineered plasmids and I created a local BLAST database of these features. I then wrote a python script which takes an inputted DNA sequence or FASTA file and BLASTs the entire sequence against curated database. Many of the hits are redundant given the nature of the manually curated list and engineered parts in general, so I filtered the results to give proper annotations. During the filtering process, I also located fragments of these engineered parts, which I define as having at least 90% identity, but less than 95% match length (these values were empirically determined to give accurate results).

     Using a database of nearly 20,000 engineered plasmid sequences provided by Addgene, I then applied this pipeline to those plasmids, quantifying the amount of “junk” contained within these plasmids. I only analyzed CDSs, because these sequences are simple to know where they stop and start. Other sequences such as ncRNA-based origins and promoters have boundaries that are much harder to define, as often they can contain UP elements, or other features that make the start and stop positions ambiguous.

Initialization

import glob
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import json
import subprocess
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.Alphabet import generic_dna
import time
from tempfile import NamedTemporaryFile


import plotly
plotly.offline.init_notebook_mode()
import plotly.offline as py
import plotly.graph_objs as go

def bash(inCommand, autoOutfile = True):
    if autoOutfile == True:
        tmp = NamedTemporaryFile()
        subprocess.call(inCommand +' > '+tmp.name, shell=True)
        f = open(tmp.name,'r')
        tmp.close()
        return(f.read())
    else:
        subprocess.call(inCommand, shell=True)  
def BLAST(seq, db='nr_db', BLASTtype="p", flags = 'sseqid pident qstart qend sseq slen send sstart'): 
    query = NamedTemporaryFile()
    SeqIO.write(SeqRecord(Seq(seq), id="temp"), query.name, "fasta")
    tmp = NamedTemporaryFile()
    bash(
        'blast'+BLASTtype+' -query ' + query.name + ' -out ' + tmp.name +
        ' -db /Users/mattmcguffie/database/BLAST_dbs/' + db +
        ' -max_target_seqs 20000 -outfmt "6 '+flags+'"')
    with open(tmp.name, "r") as file_handle:  #opens BLAST file
        align = file_handle.readlines()
        
    tmp.close() 
    query.close() 
    return align
#loads all plasmids in db
import re
with open("/Users/mattmcguffie/database/addgene-plasmids-sequences-2018-10-08.json", "r") as file_handle:
    plasmid = json.loads(file_handle.read())['plasmids'] #this takes ~5+ sec to load
#loads just the plasmids that have full seqs
fullPlasSeqList=[]
for plas in plasmid:
    pubfull=plas['sequences']['public_addgene_full_sequences']
    if len(pubfull)==1:
        if len(re.sub(r"[atcgATCG]", "", pubfull[0])) == 0:
            fullPlasSeqList.append(plas)
len(fullPlasSeqList)
20250

make BLAST db

import glob
snapGene=[]
snapgeneList=[]
for infile_loc in glob.glob('/Users/mattmcguffie/database/standard_features_snapgene/*.gb'):
    with open(infile_loc,'r') as file_handle:
        record_dict = SeqIO.to_dict(SeqIO.parse(file_handle, 'gb'))
        ele = record_dict[list(record_dict.keys())[0]]
        
        feat=ele.features[1]
        featureSeq = ele.seq[feat.location.start:feat.location.end]
        if feat.location.strand == -1:
            featureSeq = featureSeq.reverse_complement()
        label=infile_loc.split("/")[-1].replace(" ","_")
        Type = ele.features[1].type
        snapgeneList.append(SeqRecord(seq=featureSeq,id=f"{label}|{Type}",name='Exported', description= ""))
with open("/Users/mattmcguffie/database/snapgene_feature_list_w_types.fasta", "w") as handle:
    SeqIO.write(snapgeneList, handle, "fasta")
bash('makeblastdb -in /Users/mattmcguffie/database/snapgene_feature_list_w_types.fasta -out /Users/mattmcguffie/database/BLAST_dbs/snapgene_feature_list_w_types_db -dbtype nucl -parse_seqids')

'\n\nBuilding a new DB, current time: 04/29/2019 17:13:15\nNew DB name:   /Users/mattmcguffie/database/BLAST_dbs/snapgene_feature_list_w_types_db\nNew DB title:  /Users/mattmcguffie/database/snapgene_feature_list_w_types.fasta\nSequence type: Nucleotide\nDeleted existing Nucleotide BLAST database named /Users/mattmcguffie/database/BLAST_dbs/snapgene_feature_list_w_types_db\nKeep MBits: T\nMaximum file size: 1000000000B\nAdding sequences from FASTA; added 1237 sequences in 0.10682 seconds.\n'

annotating pipeline

def annotate(fileloc, write=True):
    record=list(SeqIO.parse(fileloc, "fasta"))[0]
    query=str(record.seq)
    
    hits = BLAST(query, db ="snapgene_feature_list_w_types_db",BLASTtype="n",flags='qstart qend sseqid sframe pident slen sseq')
    record.name=fileloc.split("/")[-1].split(".")[0]
    record.seq.alphabet=generic_dna
    record.annotations["topology"] = "circular"

    seqSpace=[[] for i in range(len(query))]
    for hit in hits:

        splitAlign=hit.split()

        qstart = int(splitAlign[0])-1
        qend = int(splitAlign[1])-1
        sseqid = splitAlign[2]
        sframe = int(splitAlign[3])
        pident = float(splitAlign[4])
        slen = int(splitAlign[5])
        sseq = splitAlign[6]
        percmatch = round((len(sseq)/slen)*100,3)
        
        Type=""
        misc = ""
        
        partType=sseqid.split("|")[1]
        sseqid=sseqid.split("|")[0]
        name = sseqid.split(".gb")[0].replace("_"," ")
        
        #-3/+3 gives it a little wiggle room -- a hack
        occupiedSpace=set_intersection(seqSpace[qstart+3:qend-2])
                        
        if not occupiedSpace: 
            for i in range(qstart,qend+1):
                seqSpace[i].append((sseqid,sframe,pident,percmatch))

            if pident>=90: #filters out all other hits                
                if percmatch < 95: #length of match
                    Type="Fragment"
                else:
                    Type=str(partType)
                record.features.append(SeqFeature(FeatureLocation(qstart, qend), type=Type,qualifiers={"label": name,"identity":pident,"match length":percmatch, "Other:":partType}, strand = sframe))

    #manually remove a known artifact/edge case
    #RSF is related to ColE1 and is a false positive
    #eventually need a better solution
    rmList=[]
    for feat in record.features:
        if feat.qualifiers["label"] == "RSF ori":
            if feat.qualifiers["match length"] < 90:
                rmList.append(feat)

    record.features=[ele for ele in record.features if ele not in rmList]
    
    if write==True:
        outfileloc=f"/Users/mattmcguffie/Desktop/{fileloc.split('/')[-1].split('.fa')[0]}.gbk"
        with open(outfileloc, "w") as handle:
            SeqIO.write(record, handle, "genbank")
        print("written")
            
    return(record)
CDSfragments=[]
print(len(fullPlasSeqList))
for i in range(len(fullPlasSeqList)):
    #print(f'https://www.addgene.com/{fullPlasSeqList[i]["id"]}')
    seq=fullPlasSeqList[i]['sequences']['public_addgene_full_sequences'][0]
    record=annotate(seq,write=False)
    c=[]
    for feat in record.features:
        if feat.type == 'Fragment':
            if feat.qualifiers['Other:'] == "CDS":
                c.append(feat.qualifiers["label"])
    CDSfragments.append(c)
print(f'{Counter([str(ele) for ele in CDSfragments])["[]"]} plasmids with NO fragmented CDS')
1260 plasmids with NO fragmented CDS
labels = ['At least one fragmented CDS','No fragments']
values = [len(CDSfragments)-1260,1260]
colors = ['#f9a557', '#808080']
layout = go.Layout(
    autosize=False,
    width=850,
    height=500,
    title="Number of plasmids with protein fragments",
    titlefont= {
    "size": 24
    }
)

trace = go.Pie(labels=labels, values=values, hoverinfo='label+percent', textinfo='value',  marker=dict(colors=colors, line=dict(color='#000000', width=2)))

fig = {
    'data': [trace],
    'layout': layout,
}

py.iplot(fig)
totList=[]
for ele in CDSfragments:
    totList.extend(ele)
topFrags=Counter(totList).most_common()[:20]
topFrags.reverse()
data = [
    go.Bar(
        y=[ele[0] for ele in topFrags],
        x=[ele[1] for ele in topFrags],
        orientation = 'h',
        #text=[ele[1] for ele in topFrags],
        #textposition = 'outside',
        #textfont=dict(color="black",size=35),
        marker=dict(
            color='#f9a557'
        ),
        opacity=0.8
    )
]

layout = go.Layout(
    autosize=False,
    width=850,
    height=500,
    title="Top Protein Fragments in Addgene plasmid repository",
    titlefont= {
    "size": 24
    },
    xaxis=dict(
        title='Number of Fragmented Proteins',
        titlefont=dict(
            size=18,
        ),
        type='log',
        autorange=True
    ),
     margin=go.layout.Margin(
        t=60,
        l=145,
        b=55
    ),
)

fig = {
    'data': data,
    'layout': layout,
}
py.iplot(fig, config={'displayModeBar': False})