Skip to content

Instantly share code, notes, and snippets.

@jairov4
Created October 15, 2012 23:54
Show Gist options
  • Save jairov4/3896434 to your computer and use it in GitHub Desktop.
Save jairov4/3896434 to your computer and use it in GitHub Desktop.
Sampler for CleavageSites from ProteinTagger
using Biosek;
using Biosek.Formats.UniProt;
using Newtonsoft.Json.Linq;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
namespace Sampler
{
struct csite
{
public string accession;
public int location;
}
class Program
{
static void Main(string[] args)
{
var inputFile = args[0];
var uniprotFile = args[1];
var outputFolder = args[2];
var testMultiple = args.Skip(3).Contains("--testMultiple");
var setPrefix = testMultiple ? 0 : int.Parse(args.Skip(3).First(x => x.StartsWith("--prefix=")).Substring(9));
var setSuffix = testMultiple ? 0 : int.Parse(args.Skip(3).First(x => x.StartsWith("--suffix=")).Substring(9));
var outputFile = Path.Combine(outputFolder, Path.GetFileNameWithoutExtension(inputFile));
var minWindowLength = 5;
var maxWindowLength = 30;
var db_temp = uniprot.LoadFromFile(uniprotFile);
var db = db_temp.entry.ToDictionary(x => x.accession[0], x => x.GetSequenceAsByteString());
db_temp = null; // release ref
Console.WriteLine("Loaded UniProt XML file");
var cs_temp = (JArray)JsonSerializerHelper.LoadJsonFile(inputFile);
var cleavageSites = cs_temp.Select(x => new csite { accession = (string)x["Accession"], location = (int)x["LocationBegin"] }).ToArray();
cs_temp = null; // release ref
Console.WriteLine("Loaded Cleavage Sites JSON file");
var pos = new ByteArraySet();
var neg = new ByteArraySet();
var ovl = new ByteArraySet();
var seqs = new List<byte[]>();
var random = new Random();
if (testMultiple)
{
for (int windowLength = minWindowLength; windowLength < maxWindowLength; windowLength++)
{
for (int prefixLength = 0; prefixLength < windowLength; prefixLength++)
{
int suffixLength = windowLength - prefixLength;
SampleOne(random, outputFile, db, cleavageSites, pos, neg, ovl, seqs, prefixLength, suffixLength);
}
}
}
else
{
var prefixLength = setPrefix;
var suffixLength = setSuffix;
var windowLength = prefixLength + suffixLength;
SampleOne(random, outputFile, db, cleavageSites, pos, neg, ovl, seqs, prefixLength, suffixLength);
}
}
private static void SampleOne(Random random, string outputFile, Dictionary<string, byte[]> db, csite[] cleavageSites, ByteArraySet pos, ByteArraySet neg, ByteArraySet ovl, List<byte[]> seqs, int prefixLength, int suffixLength)
{
var windowLength = prefixLength + suffixLength;
pos.Clear();
neg.Clear();
ovl.Clear();
seqs.Clear();
var cdr = SampleSeqs(db, cleavageSites, pos, neg, ovl, seqs, windowLength, prefixLength);
var posCount = pos.Count;
var negCount = neg.Count;
var ovlCount = ovl.Count;
var ofn = string.Format("{0}_{1}_{2}", outputFile, prefixLength, suffixLength);
var rp = pos.Count * 80 / 100;
var rn = neg.Count * 80 / 100;
var train_pos = pos.Take(rp);
var train_neg = neg.Take(rn);
var train_concat = SamplerHelper.TagAndConcat(train_neg, train_pos);
var test_pos = pos.Skip(rp);
var test_neg = neg.Skip(rn);
var test_concat = SamplerHelper.TagAndConcat(test_neg, test_pos);
NumericCsvHelper.SavePlain(ofn + "_train.csv", train_concat);
DestinoHelper.SavePlain(ofn + "_train.destino", train_concat);
NumericCsvHelper.SavePlain(ofn + "_test.csv", test_concat);
DestinoHelper.SavePlain(ofn + "_test.destino", test_concat);
// balanced dataset
var balanced_train_neg = train_neg.Shuffle(random).Take(train_pos.Count() * 10); // 10:1
var balanced_train_concat = SamplerHelper.TagAndConcat(balanced_train_neg, train_pos);
NumericCsvHelper.SavePlain(ofn + "_train_balanced.csv", balanced_train_concat);
DestinoHelper.SavePlain(ofn + "_train_balanced.destino", balanced_train_concat);
JsonSerializerHelper.SaveJsonFile(new { collisionsResolved = ovlCount, negCount, posCount, cdr, prefixLength, suffixLength, trainSize = train_concat.Count(), testSize = test_concat.Count() }, ofn + ".json");
Console.WriteLine("Window {0}:{1} => P:{2} N:{3} | collisions: {4}", prefixLength, suffixLength, posCount, negCount, ovlCount);
}
private static int? SampleSeqs(Dictionary<string, byte[]> db, csite[] cleavageSites, ByteArraySet pos, ByteArraySet neg, ByteArraySet ovl, List<byte[]> seqs, int windowLength, int prefixLength)
{
foreach (var item in cleavageSites)
{
var seq = db[item.accession];
seqs.Add(seq);
SamplerHelper.SampleWithSlidingWindow(seq, windowLength, item.location - prefixLength, neg, pos);
}
ovl.UnionWith(pos);
ovl.IntersectWith(neg);
var cdr = EvaluateCollisionDynamicRange(seqs, ovl);
// resolve
pos.UnionWith(ovl);
neg.ExceptWith(ovl);
return cdr;
}
/// <summary>
/// Obtiene una medida de rango dinamico para las colisiones en una coleccion de secuencias.
/// El rango dinamico que se calcula en esta funcion obtiene el mayor desplazamiento que tuvo una colision
/// en todo el conjunto de secuencias. Esto brinda un indicador de la bondad de una configuracion de muestreo
/// porque ayuda a identificar secuencias donde hay un posible mal etiquetado.
/// </summary>
/// <example>
/// sean colisiones C1={10,9,11} y C2={13,12,14} y el conjunto de secuencias SEQS esta formado por 4 secuencias.
/// C1 ocurre en las secuencias de SEQS en estos indices I1={10,10,30,31}
/// C2 ocurre en las secuencias de SEQS en estos indices I2={5,6,7,5}
/// El desplazamiento maximo para C1 es MCO1=max(I1)-min(I1)=31-10=21
/// El desplazamiento maximo para C2 es MCO2=max(I2)-min(I2)=7-5=2
/// El desplazamiento maxmo para todas las colisiones es MCO=max(MCO1,MCO2)=21
/// </example>
static int? EvaluateCollisionDynamicRange(ICollection<byte[]> collection, ICollection<byte[]> collisions)
{
if (collisions.Count == 0) return null;
var s = from collision in collisions
from seq in collection
// todas las ocurrencias de la colision
from i in IndexOfAll(seq, collision)
group i by collision into grp
select grp.Max() - grp.Min();
var co = s.Max();
return co;
}
/// <summary>
/// Encuentra todas las ocurrencias de <param name="pat"></param> en <param name="col"></param>
/// </summary>
static IEnumerable<int> IndexOfAll(byte[] col, byte[] pat)
{
for (int i = 0; i < col.Length - pat.Length + 1; i++)
{
bool found = true;
for (int j = 0; j < pat.Length; j++)
{
if (col[i + j] != pat[j])
{
found = false;
break;
}
}
if (found) yield return i;
}
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment