EMBL文件的格式详解请阅读前面的文章:
C#,生信软件实践(02)——DNA数据库EMBL格式详解及转为FASTA格式文件的源代码:https://blog.csdn.net/beijinghorn/article/details/130462070
本文的代码用于:
(1)解析 EMBL 文件并转为 C# 的类;
(2)可提取 FASTA 序列数据;
(3)可提取 FEATURES 数据;
(4)是生信工具软件 BIOG 的一部分代码;
using System;
using System.IO;
using System.Text;
using System.Linq;
using System.Drawing;
using System.Collections;
using System.Collections.Generic;
using System.Runtime.Serialization;
namespace Legal.BIOG
{
/// <summary>
/// A name/content pair taken from an EMBL flat file: <see cref="Name"/> is the
/// two-character line code at the start of a line and <see cref="Content"/> is the
/// text that belongs to it.
/// </summary>
[DataContract]
public class EMBL_ELEMENT
{
    /// <summary>Line code (the first two characters of the source line).</summary>
    [DataMember(Order = 1)]
    public string Name { get; set; } = "";

    /// <summary>Text associated with the line code.</summary>
    [DataMember(Order = 2)]
    public string Content { get; set; } = "";

    /// <summary>
    /// Builds an element from one raw line: the first two characters become
    /// <see cref="Name"/>, the trimmed remainder becomes <see cref="Content"/>.
    /// </summary>
    /// <param name="buf">
    /// Raw line. A null line is treated as empty, and a line shorter than two
    /// characters is used as the name with empty content, so malformed input no
    /// longer throws from <c>Substring</c>.
    /// </param>
    public EMBL_ELEMENT(string buf)
    {
        string line = buf ?? "";
        if (line.Length <= 2)
        {
            // Nothing follows the code (or the code itself is incomplete).
            Name = line;
            Content = "";
            return;
        }
        Name = line.Substring(0, 2);
        Content = line.Substring(2).Trim();
    }

    /// <summary>
    /// Builds an element from an already separated name and content; both values
    /// are stored as given.
    /// </summary>
    /// <param name="name">Line code.</param>
    /// <param name="content">Text for that line code.</param>
    public EMBL_ELEMENT(string name, string content)
    {
        Name = name;
        Content = content;
    }
}
/// <summary>
/// One reference block of an EMBL record, kept as the ordered list of the
/// name/content elements appended to it.
/// </summary>
[DataContract]
public class EMBL_REFERENCE
{
    /// <summary>Elements of this reference, in the order they were appended.</summary>
    [DataMember(Order = 1)]
    public List<EMBL_ELEMENT> Items { get; set; } = new List<EMBL_ELEMENT>();

    /// <summary>
    /// Adds a new element to the end of <see cref="Items"/>.
    /// </summary>
    /// <param name="name">Line code of the element.</param>
    /// <param name="content">Text of the element.</param>
    public void Append(string name, string content)
    {
        EMBL_ELEMENT entry = new EMBL_ELEMENT(name, content);
        Items.Add(entry);
    }
}
/// <summary>
/// One feature of an EMBL record: the feature name plus the raw text that
/// belongs to it.
/// </summary>
[DataContract]
public class EMBL_FEATURE
{
    /// <summary>Feature name (compared against e.g. "CDS" by callers).</summary>
    [DataMember(Order = 1)]
    public string Name { get; set; } = "";

    /// <summary>Raw feature text; split into entries by <see cref="FeatureList"/>.</summary>
    [DataMember(Order = 2)]
    public string Lines { get; set; } = "";

    /// <summary>Stores the given name and raw text as they are.</summary>
    /// <param name="name">Feature name.</param>
    /// <param name="lines">Raw feature text.</param>
    public EMBL_FEATURE(string name, string lines)
    {
        Name = name;
        Lines = lines;
    }

    /// <summary>
    /// <see cref="Lines"/> split into a list by <c>B.S2L</c>. The split is redone
    /// on every access, so callers that need it more than once should keep the result.
    /// </summary>
    public List<string> FeatureList
    {
        get
        {
            // B.S2L is defined elsewhere in the project; presumably it splits text into lines - TODO confirm.
            string[] ra = B.S2L(Lines);
            return ra.ToList();
        }
    }

    /// <summary>
    /// Looks up a qualifier of this feature, e.g. <c>/db_xref=</c>.
    /// </summary>
    /// <param name="name">Feature name this feature must have (e.g. "CDS"); otherwise nothing is searched.</param>
    /// <param name="branch">Qualifier name without the leading "/" and trailing "=" (e.g. "db_xref").</param>
    /// <returns>
    /// The text following <c>/branch=</c> in the first entry that starts with it,
    /// or an empty string when the feature name differs or no entry matches.
    /// </returns>
    public string FindBranch(string name, string branch)
    {
        // Check the cheap condition first so the text is not split for non-matching features.
        if (Name != name)
        {
            return "";
        }
        string prefix = "/" + branch + "=";
        foreach (string s in FeatureList)
        {
            // Qualifier names are format tokens, so compare ordinally rather than by culture.
            if (s.StartsWith(prefix, StringComparison.Ordinal))
            {
                return s.Substring(prefix.Length);
            }
        }
        return "";
    }

    /// <summary>
    /// The first entry of <see cref="FeatureList"/> when it contains "..";
    /// otherwise (including when there are no entries) an empty string.
    /// </summary>
    public string Position
    {
        get
        {
            List<string> list = FeatureList;
            if (list.Count == 0)
            {
                // No entries at all: report "no position" instead of indexing out of range.
                return "";
            }
            string first = list[0];
            return first.Contains("..") ? first : "";
        }
    }

    /// <summary>
    /// <see cref="Position"/> converted by <c>Utility.PositionList</c> (defined
    /// elsewhere in the project) into a list of points.
    /// </summary>
    public List<Point> PositionList
    {
        get
        {
            return Utility.PositionList(Position);
        }
    }
}
/// <summary>
/// One record of an EMBL file: its description elements, references and features.
/// </summary>
[DataContract]
public class EMBL_Item
{
    /// <summary>Description elements of the record (the sequence is stored here under the name "SQ").</summary>
    [DataMember(Order = 1)]
    public List<EMBL_ELEMENT> Descriptions { get; set; } = new List<EMBL_ELEMENT>();

    /// <summary>References of the record.</summary>
    [DataMember(Order = 2)]
    public List<EMBL_REFERENCE> References { get; set; } = new List<EMBL_REFERENCE>();

    /// <summary>Features of the record.</summary>
    [DataMember(Order = 3)]
    public List<EMBL_FEATURE> Features { get; set; } = new List<EMBL_FEATURE>();

    /// <summary>
    /// Returns the content of the first description element with the given name,
    /// with line breaks replaced by spaces.
    /// </summary>
    /// <param name="name">Element name to look for, e.g. "DE".</param>
    /// <returns>The element content on one line, or an empty string when no such element exists.</returns>
    public string Find(string name)
    {
        // List<T>.Find yields null for a reference type when nothing matches,
        // so the "not found" case has to be tested on the reference itself.
        EMBL_ELEMENT de = Descriptions.Find(t => t.Name == name);
        if (de == null)
        {
            return "";
        }
        return de.Content.Replace("\n", " ");
    }

    /// <summary>
    /// Sequence text of the record (the content of the "SQ" description element),
    /// or an empty string when the record has no such element.
    /// </summary>
    public string Sequence
    {
        get
        {
            EMBL_ELEMENT sq = Descriptions.Find(t => t.Name == "SQ");
            return (sq == null) ? "" : sq.Content;
        }
    }
}
/// <summary>
/// Parsed form of an EMBL flat file: the text is split into records
/// (<see cref="EMBL_Item"/>), which can then be exported as JSON, FASTA-style
/// sequences, per-feature sub-sequences or protein translations.
/// </summary>
public class EMBL_File
{
    /// <summary>Records in the order they were read.</summary>
    public List<EMBL_Item> Items { get; set; } = new List<EMBL_Item>();

    /// <summary>
    /// Parses EMBL text. A record starts at an "ID" line and ends at "//", at the
    /// next "ID" line, or at the end of the input.
    /// </summary>
    /// <param name="buf">Complete text of the EMBL file.</param>
    /// <exception cref="Exception">
    /// Any failure while parsing, with the message prefixed by "EMBL_File() ERROR: "
    /// and the original exception kept as <see cref="Exception.InnerException"/>.
    /// </exception>
    public EMBL_File(string buf)
    {
        try
        {
            string[] xlines = B.S2L(buf);
            EMBL_Item item = null;      // record currently being filled, null between records
            EMBL_REFERENCE rfx = null;  // reference currently being filled, null between references
            for (int i = 0; i < xlines.Length; i++)
            {
                string line = xlines[i];
                if (StartsWithCode(line, "ID"))
                {
                    // A new record also terminates a previous one that had no "//" line.
                    CloseRecord(ref item, ref rfx);
                    item = new EMBL_Item();
                    item.Descriptions.Add(new EMBL_ELEMENT(line));
                    continue;
                }
                if (StartsWithCode(line, "//"))
                {
                    CloseRecord(ref item, ref rfx);
                    continue;
                }
                if (item == null)
                {
                    // Content outside any record (before the first "ID" or after "//")
                    // has nowhere to go; skip it instead of dereferencing a null record.
                    continue;
                }
                if (StartsWithCode(line, "FH") || StartsWithCode(line, "FT"))
                {
                    // The helper receives the loop index by reference and may move it past the lines it reads.
                    string rs = Utility.ReadFeatureLines(ref i, xlines, out string kw, 2, 21);
                    item.Features.Add(new EMBL_FEATURE(kw, rs));
                    continue;
                }
                if (StartsWithCode(line, "XX"))
                {
                    // A spacer line ends the reference that is currently open, if any.
                    if (rfx != null)
                    {
                        item.References.Add(rfx);
                        rfx = null;
                    }
                    continue;
                }
                if (IsReferenceLine(line))
                {
                    // NOTE(review): only RN/RP/RA/RT/RL are collected into the reference;
                    // other R* lines end up in Descriptions - confirm this is intended.
                    bool startsReference = StartsWithCode(line, "RN");
                    if (startsReference && rfx != null)
                    {
                        item.References.Add(rfx);
                    }
                    if (startsReference || rfx == null)
                    {
                        // "RN" opens a new reference; a reference line without a preceding
                        // "RN" opens one on demand rather than failing on a null reference.
                        rfx = new EMBL_REFERENCE();
                    }
                    string rs = Utility.ReadMultiLines(ref i, xlines, out string kw, 5);
                    rfx.Append(kw, rs);
                    continue;
                }
                if (StartsWithCode(line, "SQ"))
                {
                    // Step over the "SQ" header line itself; the helper reads from the following line.
                    i++;
                    string rs = Utility.ReadSequenceLines(ref i, xlines, 5);
                    item.Descriptions.Add(new EMBL_ELEMENT("SQ", rs));
                    continue;
                }
                {
                    // Every other line code is stored as a description element.
                    string rs = Utility.ReadMultiLines(ref i, xlines, out string kw, 5, "\n");
                    item.Descriptions.Add(new EMBL_ELEMENT(kw, rs));
                }
            }
            // Input ended without a closing "//".
            CloseRecord(ref item, ref rfx);
        }
        catch (Exception ex)
        {
            throw new Exception("EMBL_File() ERROR: " + ex.Message, ex);
        }
    }

    /// <summary>Ordinal test for a line code at the start of a line.</summary>
    private static bool StartsWithCode(string line, string code)
    {
        return line.StartsWith(code, StringComparison.Ordinal);
    }

    /// <summary>True for the reference line codes that are collected into a reference.</summary>
    private static bool IsReferenceLine(string line)
    {
        return StartsWithCode(line, "RN")
            || StartsWithCode(line, "RP")
            || StartsWithCode(line, "RA")
            || StartsWithCode(line, "RT")
            || StartsWithCode(line, "RL");
    }

    /// <summary>
    /// Finishes the open record: a still-open reference is stored in it first so it
    /// cannot leak into the next record, then the record is added to <see cref="Items"/>.
    /// Both variables are reset to null.
    /// </summary>
    private void CloseRecord(ref EMBL_Item item, ref EMBL_REFERENCE rfx)
    {
        if (item != null)
        {
            if (rfx != null)
            {
                item.References.Add(rfx);
            }
            Items.Add(item);
        }
        item = null;
        rfx = null;
    }

    /// <summary>
    /// Reads a file completely and parses it.
    /// </summary>
    /// <param name="filename">Path of the EMBL file.</param>
    /// <returns>The parsed file.</returns>
    /// <exception cref="Exception">
    /// Any failure while reading or parsing, with the original exception kept as
    /// <see cref="Exception.InnerException"/>.
    /// </exception>
    public static EMBL_File FromFile(string filename)
    {
        try
        {
            string buf = File.ReadAllText(filename);
            return new EMBL_File(buf);
        }
        catch (Exception ex)
        {
            throw new Exception("EMBL_File.FromFile ERROR: " + ex.Message, ex);
        }
    }

    /// <summary>
    /// Serializes <see cref="Items"/> with <c>SimpleJson.SerializeObject</c> and
    /// writes the result to a file.
    /// </summary>
    /// <param name="filename">Path of the file to write.</param>
    /// <exception cref="Exception">
    /// Any failure while serializing or writing, with the original exception kept as
    /// <see cref="Exception.InnerException"/>.
    /// </exception>
    public void Write_Json(string filename)
    {
        try
        {
            File.WriteAllText(filename, SimpleJson.SerializeObject(Items));
        }
        catch (Exception ex)
        {
            throw new Exception("EMBL_File.Write_Json ERROR: " + ex.Message, ex);
        }
    }

    /// <summary>
    /// Exports the sequence of every record in FASTA style.
    /// </summary>
    /// <returns>
    /// For each record: a ">" header line carrying its "DE" text, the sequence
    /// passed through <c>B.BreakTo</c>, and an empty line.
    /// </returns>
    public string Fasta_Sequences()
    {
        StringBuilder sb = new StringBuilder();
        foreach (EMBL_Item item in Items)
        {
            sb.AppendLine(">" + item.Find("DE"));
            sb.AppendLine(B.BreakTo(item.Sequence));
            sb.AppendLine("");
        }
        return sb.ToString();
    }

    /// <summary>
    /// Exports the sub-sequence of every feature that has more than one entry.
    /// </summary>
    /// <returns>
    /// For each such feature: a ">" header line with the feature name and its second
    /// entry, the part of the record sequence selected by the feature's positions
    /// (via <c>Utility.SequenceByPosition</c>) passed through <c>B.BreakTo</c>, and an empty line.
    /// </returns>
    public string Print_Features()
    {
        StringBuilder sb = new StringBuilder();
        foreach (EMBL_Item item in Items)
        {
            foreach (EMBL_FEATURE feature in item.Features)
            {
                // FeatureList re-splits the text on every access, so read it once.
                List<string> entries = feature.FeatureList;
                if (entries.Count > 1)
                {
                    sb.AppendLine(">" + feature.Name + " " + entries[1]);
                    sb.AppendLine(B.BreakTo(Utility.SequenceByPosition(item.Sequence, feature.PositionList)));
                    sb.AppendLine("");
                }
            }
        }
        return sb.ToString();
    }

    /// <summary>
    /// Exports the "translation" qualifier of every "CDS" feature as a protein sequence.
    /// </summary>
    /// <returns>
    /// For each CDS feature with a non-empty translation: a ">" header line with the
    /// feature name and its second entry (empty when there is none), the translation
    /// with spaces and double quotes removed and passed through <c>B.BreakTo</c>,
    /// and an empty line.
    /// </returns>
    public string Protein()
    {
        StringBuilder sb = new StringBuilder();
        foreach (EMBL_Item item in Items)
        {
            foreach (EMBL_FEATURE feature in item.Features)
            {
                string tr = feature.FindBranch("CDS", "translation");
                if (tr.Length > 0)
                {
                    // The translation may be the only entry; do not index past the end of the list.
                    List<string> entries = feature.FeatureList;
                    string label = (entries.Count > 1) ? entries[1] : "";
                    sb.AppendLine(">" + feature.Name + " " + label);
                    sb.AppendLine(B.BreakTo(tr.Replace(" ", "").Replace("\"", "")));
                    sb.AppendLine("");
                }
            }
        }
        return sb.ToString();
    }
}
}
以这种格式发布代码是一种新的尝试,或许可以避开"内容太少"的问题。