Created
November 1, 2016 18:55
-
-
Save daler/24227fed6f5fbcfbdd29646a653a9cca to your computer and use it in GitHub Desktop.
toy examples to show how multiintersect works
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Run the script to see example output. | |
Modify the features in a, b, and c to see how they affect the output | |
pybedtools.contrib.plotting.BinaryHeatmap uses the "cluster" method, | |
but unclustered and naive merge versions are shown as well. | |
""" | |
import numpy as np | |
import pybedtools | |
def ascii_bed(x, lim): | |
""" | |
Prints a single-line ASCII representation of a BED file, replacing the | |
coordinates with the one-character name of each feature. | |
""" | |
first = False | |
y = x.sort() | |
a = np.zeros((lim,), dtype=str) | |
a[:] = '.' | |
for i in y: | |
a[i.start:i.stop] = i.name | |
return ''.join(list(a)) | |
# Play around with the coordinates of a, b, and c. | |
# | |
# The code() function below expects the names to be "a", "b", or "c". | |
a = pybedtools.BedTool( | |
""" | |
chr1 6 12 a | |
chr1 14 20 a | |
chr1 22 30 a | |
""", from_string=True) | |
b = pybedtools.BedTool( | |
""" | |
chr1 14 32 b | |
""", from_string=True) | |
c = pybedtools.BedTool( | |
""" | |
chr1 10 13 c | |
""", from_string=True) | |
unclustered = pybedtools.BedTool().multi_intersect( | |
i=[a.fn, b.fn, c.fn]) | |
clustered = pybedtools.BedTool().multi_intersect( | |
i=[a.fn, b.fn, c.fn], cluster=True) | |
def code(n): | |
""" | |
Get a unique code for each combination of characters so the combinatorial | |
binding can be represented by a single character. | |
e.g., | |
0 = no feature | |
1 = a | |
3 = b | |
4 = a+b | |
5 = c | |
6 = c+a | |
8 = b+c | |
9 = a+b+c | |
""" | |
i = 0 | |
if 'a' in n: | |
i += 1 | |
if 'b' in n: | |
i += 3 | |
if 'c' in n: | |
i += 5 | |
return i | |
def cluster_rename(x): | |
d = {'1': 'a', '2':'b', '3':'c'} | |
letters = [d[i] for i in x[4].split(',')] | |
x.name = str(code(letters)) | |
return x | |
def fix(x): | |
""" | |
After merging distinct, split out the letters into a code and set that code | |
as the feature's name. | |
""" | |
n = x.name.split(',') | |
x.name = str(code(n)) | |
return x | |
# An alternative strategy: concatenate all files, merge them together keeping | |
# track of the sets of names, and convert to codes (see the code() function). | |
f = a.cat(b, c, postmerge=False).sort().merge(c=4, o='distinct') | |
print("a:\n{}".format(a)) | |
print("b:\n{}".format(b)) | |
print("c:\n{}".format(c)) | |
print("d (multiintersect; cluster=False):\n{}".format(unclustered)) | |
print("e (multiintersect; cluster=True):\n{}".format(clustered)) | |
print("f (cat-and-merge):\n{}".format(f)) | |
number_line = '01234567890123456789012345678901234567890' | |
lim = max(max(i.stop for i in x) for x in [a,b,c]) | |
print(number_line) | |
print('ASCII version of a, b, c:\n' + '\n'.join([ascii_bed(i, lim) for i in [a, b, c]])) | |
print("\nclustered:") | |
print(ascii_bed(clustered.each(cluster_rename).saveas(), lim)) | |
print("\nunclustered:") | |
print(ascii_bed(unclustered.each(cluster_rename).saveas(), lim)) | |
print('\nmerged:') | |
print(ascii_bed(f.each(fix).saveas(), lim)) | |
print('key:') | |
print('\n'.join('{0:>5}: {1}'.format(i, code(i)) for i in ['a', 'b', 'c', 'ab', 'ac', 'bc', 'abc'])) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment