建议在Jupyter实践,python版本3.9
seq = 'ACGT'
# 从序列指定索引的碱基
seq[1]
# 'C'
# 序列长度
len(seq)
# 4
# 序列拼接 - 字符串
seq1 = 'AACC'
seq2 = 'GGTT'
print(seq1 + seq2)
# AACCGGTT
# 序列拼接 - 列表
seqs = ['A', 'C', 'G', 'T']
print(''.join(seqs))
# 随机产生ATCG碱基
import random
random.choice('ACGT')
# 生成DNA序列
import random
seq = ''
for _ in range(100):
seq += random.choice('ACGT')
print(seq)
# GTGTCGACACCCGAGTGTTCACAAGTGGTCTTTCGGCACTTTGCTCTGAGTGGCGCGCTTAGCTATAAGGGCAAGTATCCTAGTCATATCCCGAAGGCGT
# 生成DNA序列方法2
seq = ''.join([random.choice('ACGT') for _ in range(10)])
print(seq)
# ATGGATGTCT
seq = "ATGGATGTCT"
seq[1:3]
# 'AT'
seq[:3]
#'GAT'
seq[7:]
# 'GAC'
seq[-3:]
# 'GAC'
# 获取序列最长相同前缀
def longestCommonPrefix(s1, s2):
i = 0
while i < len(s1) and i < len(s2) and s1[i] == s2[i]:
i += 1
return s1[:i]
longestCommonPrefix('ACCATTG', 'ACCAAGTC')
# 'ACCA'
# 比较序列是否相同
def match(s1, s2):
if not len(s1) == len(s2):
return False
for i in range(0, len(s1)):
if not s1[i] == s2[i]:
return False
return True
match('ACCATTG', 'ACCATTG')
# False
# 序列字符串比较
'ACCATTG' == 'ACCATTG'
True
def Complement(seq):
complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
t = ''
for base in seq:
t = t + complement[base]
return t
Complement('ACCATTG')
# 'TGGTAAC'
def reverseComplement(seq):
complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
t = ''
for base in seq:
t = complement[base] + t
return t
reverseComplement('ACCATTG')
# 'CAATGGT'
def readGenome(filename):
genome = ''
with open(filename, 'r') as f:
for line in f:
# 忽略文件头信息
if not line[0] == '>':
genome += line.rstrip()
return genome
genome = readGenome('lambda_virus.fa')
len(genome)
# 48502
genome[:100]
# 'GGGCGGCGACCTCGCGGGTTTTCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACC'
# 循环遍历方法
counts = {'A': 0, 'C': 0, 'G': 0, 'T': 0}
for base in genome:
counts[base] += 1
print(counts)
# {'A': 12334, 'C': 11362, 'G': 12820, 'T': 11986}
# 调用库
import collections
collections.Counter(genome)
# Counter({'G': 12820, 'A': 12334, 'T': 11986, 'C': 11362})