参考这篇文章的有意思的分析脚本~
如何严格定义单拷贝基因的类型,值得思考。
Insights into angiosperm evolution, floral development and chemical biosynthesis from the Aristolochia fimbriata genome
https://github.com/yihenghu/Aristolochia_fimbriata_genome_analysis #脚本github地址
orthoMCL分析
get_coalescent_codon.py 从单基因第一和第二密码子位置、第三密码子位置的拼接比对中估计的基于共祖模型的系统树。
get_concatenated_codon.py 从所有基因的第一和第二密码子位置、第三密码子位置的拼接比对中估计的基于拼接模型的系统树。
01 trictly_single_copy
get_strictly_single_copy_orthogroup.py 这个脚本用于从OrthoMCL中选择严格的单拷贝直系同源组。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import re
import collections
# read orthogroup of orthoMCL or orthoFinder
def readGroups(groups):
orthogroups = {}
with open(groups, "r") as OGs:
for og in OGs:
if not og.startswith("\t"):
orthogroups0 = re.split(r'\s*', re.sub(',', ' ', og.rstrip()))
orthogroups[orthogroups0[0]] = orthogroups0[1:]
return orthogroups # {OG1:[geneA1,geneB1,geneC1],OG2:[geneA2,geneB2]}
# stat orthogroup
def statGroups(orthogroups):
orthogroups_stat = {}
for og_id in orthogroups.keys():
species = re.findall(r"(\w+)\|", "\t".join(orthogroups[og_id]))
species_copy = dict(collections.Counter(species))
orthogroups_stat[og_id] = species_copy
return orthogroups_stat
# multiple of list
def mult(list0):
result = 1
for i in list0:
result *= i
return result
def main():
orthogroups = readGroups(sys.argv[1])
orthogroups_stat = statGroups(orthogroups)
# select "Pso==2,Pni==2", and other species are single copy
for og, stats in orthogroups_stat.items():
if len(stats) == 22:
if stats["Pso"] <= 2 and stats["Pni"] <= 2:
sp_num = {}
for sp, number in stats.items():
if sp == "Pso" or sp == "Pni":
pass
else:
if number == 1:
sp_num[sp] = number
if sum(sp_num.values()) == 20 and mult(sp_num.values()) == 1:
print og
if __name__ == "__main__":
main()
02 mostly_single_copy
get_mostly_single_copy_orthogroup.py 这个脚本用于从OrthoMCL中选择主要是单拷贝的直系同源组。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import re
import collections
# read orthogroup of orthoMCL
def readGroups(groups):
orthogroups = {}
with open(groups, "r") as OGs:
for og in OGs:
if not og.startswith("\t"):
orthogroups0 = re.split(r'\s*', re.sub(',', ' ', og.rstrip()))
orthogroups[orthogroups0[0]] = orthogroups0[1:]
return orthogroups # {OG1:[geneA1,geneB1,geneC1],OG2:[geneA2,geneB2]}
# stat orthogroup
def statGroups(orthogroups):
orthogroups_stat = {}
for og_id in orthogroups.keys():
species = re.findall(r"(\w+)\|", "\t".join(orthogroups[og_id]))
species_copy = dict(collections.Counter(species))
orthogroups_stat[og_id] = species_copy
return orthogroups_stat
# multiple of list
def mult(list0):
result = 1
for i in list0:
result *= i
return result
# stat monocots, eudicots, magnoliids, basal angiosperm per OGs cotain number of genes
division = {
"Smo": "Out",
"Pab": "Out",
"Gbi": "Out",
"Atr": "Out",
"Nad": "Out",
"Spo": "Mon",
"Peq": "Mon",
"Mac": "Mon",
"Sbi": "Mon",
"Osa": "Mon",
"Afi": "Mog",
"Cmi": "Mog",
"Lir": "Mog",
"Pni": "Mog",
"Pam": "Mog",
"Aco": "Eud",
"Pso": "Eud",
"Nnu": "Eud",
"Vvi": "Eud",
"Sly": "Eud",
"Ptr": "Eud",
"Ath": "Eud"}
def main():
orthogroups = readGroups(sys.argv[1])
orthogroups_stat = statGroups(orthogroups)
new_orthogroups_stat = {}
for og,stats in orthogroups_stat.items():
new_stats = {}
for sp in stats:
new_stats[(sp, division[sp])] = stats[sp]
new_orthogroups_stat[og] = new_stats
for og, stats in new_orthogroups_stat.items():
out, mon, mog, eud = 0, 0, 0, 0
out0, mon0, mog0, eud0 = 0, 0, 0, 0
for sp_di in stats:
if sp_di[1] == "Out":
if stats[sp_di] == 1:
out += 1
else:
out0 += 1
elif sp_di[1] == "Mon":
if stats[sp_di] == 1:
mon += 1
else:
mon0 += 1
elif sp_di[1] == "Mog":
if sp_di[0] == "Pni":
if 0 < stats[sp_di] <= 2:
mog += 1
else:
mog0 += 1
else:
if stats[sp_di] == 1:
mog += 1
else:
mog0 += 1
elif sp_di[1] == "Eud":
if sp_di[0] == "Pso":
if 0 < stats[sp_di] <= 2:
eud += 1
else:
eud0 += 1
else:
if stats[sp_di] == 1:
eud += 1
else:
eud0 += 1
if out0 == 0 and mon0 == 0 and mog0 == 0 and eud0 == 0:
if out >= 3 and mon >= 3 and mog >= 3 and eud >= 4:
print og
if __name__ == "__main__":
main()
get_supermatrix_from_MSA_for_low_copy.py 这个脚本用于从OrthoMCL中获取基于拼接比对的超级矩阵(supermatrix)。