Skip to content

Commit

Permalink
Two new functions added.
Browse files Browse the repository at this point in the history
list_files() lists all the files in the requested folder.
join_sequencing_fragments() joins sequencing fragments together.
  • Loading branch information
Mingzhang authored and Mingzhang committed May 23, 2016
1 parent 374e581 commit 04b09a6
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 110 deletions.
119 changes: 12 additions & 107 deletions Mybiotools demo.ipynb → Demo.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -206,65 +206,6 @@
"p53.cds_seq"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"p53seq = '''GATGGGATTGGGGTTTTCCCCTCCCATGTGCTCAAGACTGGCGCTAAAAGTTTTGAGCTTCTCAAAAGTC\n",
"TAGAGCCACCGTCCAGGGAGCAGGTAGCTGCTGGGCTCCGGGGACACTTTGCGTTCGGGCTGGGAGCGTG\n",
"CTTTCCACGACGGTGACACGCTTCCCTGGATTGGCAGCCAGACTGCCTTCCGGGTCACTGCCATGGAGGA\n",
"GCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTCAGGAAACATTTTCAGACCTATGGAAACTACTT\n",
"CCTGAAAACAACGTTCTGTCCCCCTTGCCGTCCCAAGCAATGGATGATTTGATGCTGTCCCCGGACGATA\n",
"TTGAACAATGGTTCACTGAAGACCCAGGTCCAGATGAAGCTCCCAGAATGCCAGAGGCTGCTCCCCCCGT\n",
"GGCCCCTGCACCAGCAGCTCCTACACCGGCGGCCCCTGCACCAGCCCCCTCCTGGCCCCTGTCATCTTCT\n",
"GTCCCTTCCCAGAAAACCTACCAGGGCAGCTACGGTTTCCGTCTGGGCTTCTTGCATTCTGGGACAGCCA\n",
"AGTCTGTGACTTGCACGTACTCCCCTGCCCTCAACAAGATGTTTTGCCAACTGGCCAAGACCTGCCCTGT\n",
"GCAGCTGTGGGTTGATTCCACACCCCCGCCCGGCACCCGCGTCCGCGCCATGGCCATCTACAAGCAGTCA\n",
"CAGCACATGACGGAGGTTGTGAGGCGCTGCCCCCACCATGAGCGCTGCTCAGATAGCGATGGTCTGGCCC\n",
"CTCCTCAGCATCTTATCCGAGTGGAAGGAAATTTGCGTGTGGAGTATTTGGATGACAGAAACACTTTTCG\n",
"ACATAGTGTGGTGGTGCCCTATGAGCCGCCTGAGGTTGGCTCTGACTGTACCACCATCCACTACAACTAC\n",
"ATGTGTAACAGTTCCTGCATGGGCGGCATGAACCGGAGGCCCATCCTCACCATCATCACACTGGAAGACT\n",
"CCAGTGGTAATCTACTGGGACGGAACAGCTTTGAGGTGCGTGTTTGTGCCTGTCCTGGGAGAGACCGGCG\n",
"CACAGAGGAAGAGAATCTCCGCAAGAAAGGGGAGCCTCACCACGAGCTGCCCCCAGGGAGCACTAAGCGA\n",
"GCACTGCCCAACAACACCAGCTCCTCTCCCCAGCCAAAGAAGAAACCACTGGATGGAGAATATTTCACCC\n",
"TTCAGATCCGTGGGCGTGAGCGCTTCGAGATGTTCCGAGAGCTGAATGAGGCCTTGGAACTCAAGGATGC\n",
"CCAGGCTGGGAAGGAGCCAGGGGGGAGCAGGGCTCACTCCAGCCACCTGAAGTCCAAAAAGGGTCAGTCT\n",
"ACCTCCCGCCATAAAAAACTCATGTTCAAGACAGAAGGGCCTGACTCAGACTGACATTCTCCACTTCTTG\n",
"TTCCCCACTGACAGCCTCCCACCCCCATCTCTCCCTCCCCTGCCATTTTGGGTTTTGGGTCTTTGAACCC\n",
"TTGCTTGCAATAGGTGTGCGTCAGAAGCACCCAGGACTTCCATTTGCTTTGTCCCGGGGCTCCACTGAAC\n",
"AAGTTGGCCTGCACTGGTGTTTTGTTGTGGGGAGGAGGATGGGGAGTAGGACATACCAGCTTAGATTTTA\n",
"AGGTTTTTACTGTGAGGGATGTTTGGGAGATGTAAGAAATGTTCTTGCAGTTAAGGGTTAGTTTACAATC\n",
"AGCCACATTCTAGGTAGGGGCCCACTTCACCGTACTAACCAGGGAAGCTGTCCCTCACTGTTGAATTTTC\n",
"TCTAACTTCAAGGCCCATATCTGTGAAATGCTGGCATTTGCACCTACCTCACAGAGTGCATTGTGAGGGT\n",
"TAATGAAATAATGTACATCTGGCCTTGAAACCACCTTTTATTACATGGGGTCTAGAACTTGACCCCCTTG\n",
"AGGGTGCTTGTTCCCTCTCCCTGTTGGTCGGTGGGTTGGTAGTTTCTACAGTTGGGCAGCTGGTTAGGTA\n",
"GAGGGAGTTGTCAAGTCTCTGCTGGCCCAGCCAAACCCTGTCTGACAACCTCTTGGTGAACCTTAGTACC\n",
"TAAAAGGAAATCTCACCCCATCCCACACCCTGGAGGATTTCATCTCTTGTATATGATGATCTGGATCCAC\n",
"CAAGACTTGTTTTATGCTCAGGGTCAATTTCTTTTTTCTTTTTTTTTTTTTTTTTTCTTTTTCTTTGAGA\n",
"CTGGGTCTCGCTTTGTTGCCCAGGCTGGAGTGGAGTGGCGTGATCTTGGCTTACTGCAGCCTTTGCCTCC\n",
"CCGGCTCGAGCAGTCCTGCCTCAGCCTCCGGAGTAGCTGGGACCACAGGTTCATGCCACCATGGCCAGCC\n",
"AACTTTTGCATGTTTTGTAGAGATGGGGTCTCACAGTGTTGCCCAGGCTGGTCTCAAACTCCTGGGCTCA\n",
"GGCGATCCACCTGTCTCAGCCTCCCAGAGTGCTGGGATTACAATTGTGAGCCACCACGTCCAGCTGGAAG\n",
"GGTCAACATCTTTTACATTCTGCAAGCACATCTGCATTTTCACCCCACCCTTCCCCTCCTTCTCCCTTTT\n",
"TATATCCCATTTTTATATCGATCTCTTATTTTACAATAAAACTTTGCTGCCACCTGTGTGTCTGAGGGGT\n",
"G'''"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"len(p53seq)"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand All @@ -284,7 +225,7 @@
},
"outputs": [],
"source": [
"cleaned = mb.clean_seq(p53seq)"
"mb.help_info()"
]
},
{
Expand All @@ -295,7 +236,7 @@
},
"outputs": [],
"source": [
"len(cleaned)"
"mb.uniprot_search('TP53')"
]
},
{
Expand All @@ -306,7 +247,7 @@
},
"outputs": [],
"source": [
"cleaned[:20]"
"mb.list_files('.')"
]
},
{
Expand All @@ -317,7 +258,7 @@
},
"outputs": [],
"source": [
"mb.reverse_complementory(cleaned[:20])"
"mb.list_files('.', '.pickle')"
]
},
{
Expand All @@ -328,20 +269,7 @@
},
"outputs": [],
"source": [
"mb.primers(cleaned, length = 25)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"fragment1 = cleaned[:900]\n",
"fragment2 = cleaned[800:1700]\n",
"fragment3 = cleaned[1600:]"
"mb.list_files('~/Desktop/a')"
]
},
{
Expand All @@ -352,21 +280,7 @@
},
"outputs": [],
"source": [
"fragment3"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"primers_tp53 = []\n",
"for fragment in [fragment1, fragment2, fragment3]:\n",
" temp = mb.primers(fragment, length = 20)\n",
" primers_tp53.append(temp)"
"mb.list_files('~/Desktop/a')"
]
},
{
Expand All @@ -377,18 +291,8 @@
},
"outputs": [],
"source": [
"primers_tp53"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"fasta = mb.read_fasta('./sequence.fasta')"
"import os\n",
"l = os.listdir('/Users/aayangm11b/Desktop/a')"
]
},
{
Expand All @@ -399,7 +303,7 @@
},
"outputs": [],
"source": [
"len(fasta)"
"l"
]
},
{
Expand All @@ -410,7 +314,8 @@
},
"outputs": [],
"source": [
"mb.help_info()"
"os.chdir('/Users/aayangm11b/Desktop/a')\n",
"seq = mb.join_sequecning_fragments(l)"
]
},
{
Expand All @@ -421,7 +326,7 @@
},
"outputs": [],
"source": [
"mb.uniprot_search('TP53')"
"len(seq)"
]
},
{
Expand Down
54 changes: 51 additions & 3 deletions Mybiotools.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
import urllib
import requests
import sys
import os
import os.path

#instructions
#1. html5lib is required for pandas to read table from html.
Expand Down Expand Up @@ -80,14 +82,60 @@ def primers(s, length = 21, f_addon = '', r_addon = ''):
def read_fasta(path):
    '''
    Read a local fasta file downloaded from NCBI and return its sequence.

    The first line of the file (the fasta header) is discarded. Every
    remaining line is stripped of surrounding whitespace and the pieces are
    concatenated into a single string.
    '''
    with open(path) as handle:
        lines = handle.readlines()
    # Everything after the header line is sequence data.
    sequence_lines = lines[1:]
    return ''.join(line.strip() for line in sequence_lines)


def list_files(dir_name, type=None, *args):
    '''
    List the names of the regular files directly inside dir_name.

    Parameters:
    dir_name: folder to scan. A leading '~' is expanded to the user's home
              directory.
    type: optional extension filter. Either a single extension string such
          as '.py', or a collection of extensions such as
          ['.txt', '.csv', '.data']. None (the default) keeps every file.
    *args: accepted for backward compatibility and ignored.

    Returns:
    A sorted list of file names (not full paths). Sub-folders are skipped.
    '''
    folder = os.path.expanduser(dir_name)

    # Normalise the filter to a tuple of extensions, or None for "no filter".
    if type is None:
        wanted = None
    elif isinstance(type, str):
        wanted = (type,)
    else:
        wanted = tuple(type)

    names = []
    # os.listdir gives no ordering guarantee; sort so the result is stable.
    for name in sorted(os.listdir(folder)):
        # isfile() must see the full path: a bare name would be resolved
        # against the current working directory instead of dir_name.
        if not os.path.isfile(os.path.join(folder, name)):
            continue
        if wanted is None or os.path.splitext(name)[1] in wanted:
            names.append(name)
    return names

def _trim_sequencing_read(raw, skip, max_len):
    '''Return the usable part of one raw read: drop line breaks, skip the first `skip` bases, then cut at the first 'N' or at `max_len`.'''
    seq = raw.replace('\n', '')[skip:]
    first_n = seq.find('N')
    # find() returns -1 when there is no 'N' at all; that case must fall
    # through to the length cap rather than be used as a slice end.
    if first_n == -1 or first_n > max_len:
        return seq[:max_len]
    return seq[:first_n]


def join_sequecning_fragments(file_list, skip=100, max_len=800, overlap=20):
    '''
    Join overlapping sequencing reads into one continuous sequence.

    When you send your PCR products or plasmid for sequencing, you probably
    get several .seq files back. Usually, one needs to align these fragments
    with the reference sequence individually. This function provides an easy
    way to stitch them together instead.

    Prerequisites:
    1. The paths in file_list must be readable from the current working
       directory (e.g. change into the folder holding the results first).
    2. The files must be arranged in order: upstream fragments come before
       downstream fragments.

    Parameters:
    file_list: ordered collection of paths to the .seq files.
    skip: number of leading bases discarded from every read (default 100).
    max_len: maximum number of bases kept from every read after skipping;
             a read is cut earlier at its first 'N' (default 800).
    overlap: number of trailing bases of a fragment that are searched for
             in the next fragment to locate the junction (default 20).

    Returns:
    The joined sequence as a string, or '' when file_list is empty. If the
    junction between two consecutive fragments cannot be located, a message
    string beginning with 'Something wrong between' is returned instead
    (kept for backward compatibility).

    Raises:
    ValueError: if overlap is smaller than 1.
    '''
    if overlap < 1:
        raise ValueError('overlap must be at least 1')

    seqs = []
    for each_file in file_list:
        with open(each_file) as handle:
            seqs.append(_trim_sequencing_read(handle.read(), skip, max_len))

    if not seqs:
        return ''

    pieces = [seqs[0]]
    for i, (upstream, downstream) in enumerate(zip(seqs, seqs[1:])):
        anchor = upstream[-overlap:]
        # An empty anchor would "match" at position 0 and silently produce a
        # wrong join, so treat it as a failed junction.
        idx = downstream.find(anchor) if anchor else -1
        if idx == -1:
            return 'Something wrong between %d and %d seq. Make sure your .seq files are in order.'%(i, i+1)
        # Skip exactly the matched anchor, which may be shorter than
        # `overlap` when the upstream fragment itself is short.
        pieces.append(downstream[idx + len(anchor):])

    return ''.join(pieces)


# Correctly spelled alias; the original (misspelled) name stays available.
join_sequencing_fragments = join_sequecning_fragments


def help_info():
    '''Print a short usage summary of Mybiotools to standard output.'''
    # NOTE(review): the first three entries appear twice below (plain and
    # *starred* form) -- this looks like old and new lines of a diff shown
    # together; confirm which set is current before relying on the output.
    print(
    '''
clean_seq input_seq: remove white space or numbers in the input gene or protein sequence.
reverse_complementory input_seq: get the reversed complementory sequence of the input DNA sequence
GC_content input_seq: calculate the GC content of the input sequence
*clean_seq(input_seq)*: remove white space or numbers in the input gene or protein sequence.
*reverse_complementory(input_seq)*: get the reversed complementory sequence of the input DNA sequence.
*GC_content(input_seq)*: calculate the GC content of the input sequence.
--help or -h: get help info of Mybiotools
''')

Expand Down

0 comments on commit 04b09a6

Please sign in to comment.