Created
October 15, 2012 23:54
-
-
Save jairov4/3896434 to your computer and use it in GitHub Desktop.
Sampler for CleavageSites from ProteinTagger
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using Biosek; | |
using Biosek.Formats.UniProt; | |
using Newtonsoft.Json.Linq; | |
using System; | |
using System.Collections.Generic; | |
using System.IO; | |
using System.Linq; | |
namespace Sampler | |
{ | |
struct csite | |
{ | |
public string accession; | |
public int location; | |
} | |
class Program | |
{ | |
static void Main(string[] args) | |
{ | |
var inputFile = args[0]; | |
var uniprotFile = args[1]; | |
var outputFolder = args[2]; | |
var testMultiple = args.Skip(3).Contains("--testMultiple"); | |
var setPrefix = testMultiple ? 0 : int.Parse(args.Skip(3).First(x => x.StartsWith("--prefix=")).Substring(9)); | |
var setSuffix = testMultiple ? 0 : int.Parse(args.Skip(3).First(x => x.StartsWith("--suffix=")).Substring(9)); | |
var outputFile = Path.Combine(outputFolder, Path.GetFileNameWithoutExtension(inputFile)); | |
var minWindowLength = 5; | |
var maxWindowLength = 30; | |
var db_temp = uniprot.LoadFromFile(uniprotFile); | |
var db = db_temp.entry.ToDictionary(x => x.accession[0], x => x.GetSequenceAsByteString()); | |
db_temp = null; // release ref | |
Console.WriteLine("Loaded UniProt XML file"); | |
var cs_temp = (JArray)JsonSerializerHelper.LoadJsonFile(inputFile); | |
var cleavageSites = cs_temp.Select(x => new csite { accession = (string)x["Accession"], location = (int)x["LocationBegin"] }).ToArray(); | |
cs_temp = null; // release ref | |
Console.WriteLine("Loaded Cleavage Sites JSON file"); | |
var pos = new ByteArraySet(); | |
var neg = new ByteArraySet(); | |
var ovl = new ByteArraySet(); | |
var seqs = new List<byte[]>(); | |
var random = new Random(); | |
if (testMultiple) | |
{ | |
for (int windowLength = minWindowLength; windowLength < maxWindowLength; windowLength++) | |
{ | |
for (int prefixLength = 0; prefixLength < windowLength; prefixLength++) | |
{ | |
int suffixLength = windowLength - prefixLength; | |
SampleOne(random, outputFile, db, cleavageSites, pos, neg, ovl, seqs, prefixLength, suffixLength); | |
} | |
} | |
} | |
else | |
{ | |
var prefixLength = setPrefix; | |
var suffixLength = setSuffix; | |
var windowLength = prefixLength + suffixLength; | |
SampleOne(random, outputFile, db, cleavageSites, pos, neg, ovl, seqs, prefixLength, suffixLength); | |
} | |
} | |
private static void SampleOne(Random random, string outputFile, Dictionary<string, byte[]> db, csite[] cleavageSites, ByteArraySet pos, ByteArraySet neg, ByteArraySet ovl, List<byte[]> seqs, int prefixLength, int suffixLength) | |
{ | |
var windowLength = prefixLength + suffixLength; | |
pos.Clear(); | |
neg.Clear(); | |
ovl.Clear(); | |
seqs.Clear(); | |
var cdr = SampleSeqs(db, cleavageSites, pos, neg, ovl, seqs, windowLength, prefixLength); | |
var posCount = pos.Count; | |
var negCount = neg.Count; | |
var ovlCount = ovl.Count; | |
var ofn = string.Format("{0}_{1}_{2}", outputFile, prefixLength, suffixLength); | |
var rp = pos.Count * 80 / 100; | |
var rn = neg.Count * 80 / 100; | |
var train_pos = pos.Take(rp); | |
var train_neg = neg.Take(rn); | |
var train_concat = SamplerHelper.TagAndConcat(train_neg, train_pos); | |
var test_pos = pos.Skip(rp); | |
var test_neg = neg.Skip(rn); | |
var test_concat = SamplerHelper.TagAndConcat(test_neg, test_pos); | |
NumericCsvHelper.SavePlain(ofn + "_train.csv", train_concat); | |
DestinoHelper.SavePlain(ofn + "_train.destino", train_concat); | |
NumericCsvHelper.SavePlain(ofn + "_test.csv", test_concat); | |
DestinoHelper.SavePlain(ofn + "_test.destino", test_concat); | |
// balanced dataset | |
var balanced_train_neg = train_neg.Shuffle(random).Take(train_pos.Count() * 10); // 10:1 | |
var balanced_train_concat = SamplerHelper.TagAndConcat(balanced_train_neg, train_pos); | |
NumericCsvHelper.SavePlain(ofn + "_train_balanced.csv", balanced_train_concat); | |
DestinoHelper.SavePlain(ofn + "_train_balanced.destino", balanced_train_concat); | |
JsonSerializerHelper.SaveJsonFile(new { collisionsResolved = ovlCount, negCount, posCount, cdr, prefixLength, suffixLength, trainSize = train_concat.Count(), testSize = test_concat.Count() }, ofn + ".json"); | |
Console.WriteLine("Window {0}:{1} => P:{2} N:{3} | collisions: {4}", prefixLength, suffixLength, posCount, negCount, ovlCount); | |
} | |
private static int? SampleSeqs(Dictionary<string, byte[]> db, csite[] cleavageSites, ByteArraySet pos, ByteArraySet neg, ByteArraySet ovl, List<byte[]> seqs, int windowLength, int prefixLength) | |
{ | |
foreach (var item in cleavageSites) | |
{ | |
var seq = db[item.accession]; | |
seqs.Add(seq); | |
SamplerHelper.SampleWithSlidingWindow(seq, windowLength, item.location - prefixLength, neg, pos); | |
} | |
ovl.UnionWith(pos); | |
ovl.IntersectWith(neg); | |
var cdr = EvaluateCollisionDynamicRange(seqs, ovl); | |
// resolve | |
pos.UnionWith(ovl); | |
neg.ExceptWith(ovl); | |
return cdr; | |
} | |
/// <summary> | |
/// Obtiene una medida de rango dinamico para las colisiones en una coleccion de secuencias. | |
/// El rango dinamico que se calcula en esta funcion obtiene el mayor desplazamiento que tuvo una colision | |
/// en todo el conjunto de secuencias. Esto brinda un indicador de la bondad de una configuracion de muestreo | |
/// porque ayuda a identificar secuencias donde hay un posible mal etiquetado. | |
/// </summary> | |
/// <example> | |
/// sean colisiones C1={10,9,11} y C2={13,12,14} y el conjunto de secuencias SEQS esta formado por 4 secuencias. | |
/// C1 ocurre en las secuencias de SEQS en estos indices I1={10,10,30,31} | |
/// C2 ocurre en las secuencias de SEQS en estos indices I2={5,6,7,5} | |
/// El desplazamiento maximo para C1 es MCO1=max(I1)-min(I1)=31-10=21 | |
/// El desplazamiento maximo para C2 es MCO2=max(I2)-min(I2)=7-5=2 | |
/// El desplazamiento maxmo para todas las colisiones es MCO=max(MCO1,MCO2)=21 | |
/// </example> | |
static int? EvaluateCollisionDynamicRange(ICollection<byte[]> collection, ICollection<byte[]> collisions) | |
{ | |
if (collisions.Count == 0) return null; | |
var s = from collision in collisions | |
from seq in collection | |
// todas las ocurrencias de la colision | |
from i in IndexOfAll(seq, collision) | |
group i by collision into grp | |
select grp.Max() - grp.Min(); | |
var co = s.Max(); | |
return co; | |
} | |
/// <summary> | |
/// Encuentra todas las ocurrencias de <param name="pat"></param> en <param name="col"></param> | |
/// </summary> | |
static IEnumerable<int> IndexOfAll(byte[] col, byte[] pat) | |
{ | |
for (int i = 0; i < col.Length - pat.Length + 1; i++) | |
{ | |
bool found = true; | |
for (int j = 0; j < pat.Length; j++) | |
{ | |
if (col[i + j] != pat[j]) | |
{ | |
found = false; | |
break; | |
} | |
} | |
if (found) yield return i; | |
} | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment