Created
November 24, 2015 22:04
-
-
Save devdazed/69fcd98b6655a35c092b to your computer and use it in GitHub Desktop.
Count tombstones in a Cassandra Table
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
Counts the number of tombstones in a keyspace.table and reports the top N highest counts | |
tombstone_count.py | |
[-h] This help screen | |
[--data-dir DATA_DIR] The C* data directory (/var/lib/cassandra/data) | |
[--top-k TOP_K] The top number of keys with highest tombstone counts to display. | |
keyspace The keyspace that contains the table | |
table The table to count tombstones | |
""" | |
from collections import Counter | |
import argparse | |
import glob | |
import json | |
import operator | |
import subprocess | |
def sizeof_fmt(num, suffix='B'):
    """Render a size as a human-readable string using binary (1024) prefixes.

    The value is divided by 1024 until its magnitude drops below 1024, and is
    then formatted with one decimal place followed by the matching prefix
    ('', 'Ki', ... 'Zi') and ``suffix``. Anything still too large after the
    'Zi' step is reported with the 'Yi' prefix.
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    value = num
    for prefix in prefixes:
        if not (abs(value) < 1024.0):
            # Still too big for this prefix: scale down and try the next one.
            value /= 1024.0
            continue
        return "%3.1f%s%s" % (value, prefix, suffix)
    return "%.1f%s%s" % (value, 'Yi', suffix)
class SSTableReader(object):
    """Iterator over the JSON row objects printed by ``sstable2json``.

    The subprocess is started lazily on the first ``next()`` call. Its output
    is consumed one character at a time and each top-level ``{...}`` object is
    decoded as soon as it is complete, so the whole export is never held in
    memory. ``num_bytes`` is the number of characters consumed so far.

    Once the output is exhausted (or parsing fails) the pipe is closed and the
    subprocess is reaped, so iterating many SSTables does not accumulate open
    pipes or zombie processes.
    """

    def __init__(self, sstable):
        self._sstable = sstable
        self._proc = None
        self._opened = False
        self._exhausted = False
        self.num_bytes = 0

    def __iter__(self):
        return self

    def _read_n(self, n=1):
        """Discard up to ``n`` characters, counting only those actually read."""
        skipped = self._proc.stdout.read(n)
        self.num_bytes += len(skipped)

    def _next_object(self):
        """Return the next decoded row, or None when the output is finished.

        Raises ValueError if a row starts with anything other than ``{``.
        """
        buf = []
        for char in self.read():
            if char == '\n':
                continue
            if not buf:
                if char == ']':
                    # Closing bracket of the outer array with no row pending
                    # (e.g. an export that contains no rows): nothing left.
                    return None
                if char != '{':
                    raise ValueError('Invalid JSON Object Start Char: {0} ({1})'.format(char, ord(char)))
            buf.append(char)
            # the object ends with a `}`, so each one we see we try to marshal
            # if the marshal works, the object is complete
            if char == '}':
                try:
                    row = json.loads(''.join(buf))
                except ValueError:
                    # if we can't marshal the object, then continue reading
                    continue
                self._read_n(2)  # skip past the next two chars `,\n`
                return row
        return None

    def open(self):
        """Start ``sstable2json`` for this SSTable with its stdout piped."""
        self._proc = subprocess.Popen(['sstable2json', self._sstable], stdout=subprocess.PIPE, bufsize=1048576)

    def close(self):
        """Close the pipe and reap the subprocess; safe to call repeatedly."""
        self._exhausted = True
        proc, self._proc = self._proc, None
        if proc is None:
            return
        if proc.stdout is not None:
            proc.stdout.close()
        if proc.poll() is None:
            # Still running (e.g. we stopped early after a parse error):
            # ask it to stop so wait() below cannot block on a long export.
            proc.terminate()
        proc.wait()

    def read(self):
        """Yield the subprocess output one character at a time until EOF."""
        stdout = self._proc.stdout
        while True:
            c = stdout.read(1)
            if not c:
                return
            # Count before yielding: the consumer may abandon this generator
            # right after receiving a character, in which case code placed
            # after the yield would never run and the character would be lost
            # from the total.
            self.num_bytes += 1
            yield c

    def next(self):
        """Return the next row dict; raise StopIteration when done."""
        if self._exhausted:
            raise StopIteration()
        if not self._opened:
            self.open()
            self._opened = True
            self._read_n(2)  # skip past the first two chars `[\n`
        try:
            next_object = self._next_object()
        except Exception:
            self.close()
            raise
        if next_object is None:
            self.close()
            raise StopIteration()
        return next_object

    # Alias so the class also satisfies the Python 3 iterator protocol.
    __next__ = next
class TombstoneCounter(object):
    """Tally tombstone cells per row key across every SSTable of one table.

    SSTables are located under ``<data_dir>/<keyspace>/<table>/``, each one is
    streamed through ``SSTableReader``, and the per-key counts are kept in a
    ``Counter`` so that ``report`` can list the keys with the most tombstones.
    """

    def __init__(self, keyspace, table, data_dir):
        self._keyspace = keyspace
        self._table = table
        self._data_dir = data_dir
        self._tombstones = Counter()
        self._total_bytes = 0
        self._sstable_count = 0

    @staticmethod
    def read_sstable_json(sstable):
        """Announce the SSTable being read and return a reader for it."""
        print('Reading {0}'.format(sstable))
        return SSTableReader(sstable)

    def sstable_files(self):
        """Return the ``*-Data.db`` paths for the table, recording how many."""
        pattern = '{0}/{1}/{2}/*-Data.db'.format(self._data_dir, self._keyspace, self._table)
        found = glob.glob(pattern)
        self._sstable_count = len(found)
        print('Found {0} sstables'.format(self._sstable_count))
        return found

    def count_tombstones(self):
        """Count tombstones in every SSTable that belongs to the table."""
        for path in self.sstable_files():
            self.count_tombstones_in_sstable(path)

    def count_tombstones_in_row(self, row):
        """Add the number of tombstone cells in ``row`` to its key's tally.

        A cell counts when it has more than three elements and the fourth is
        the marker ``'t'``.
        NOTE(review): presumably 't' is the sstable2json tombstone marker;
        confirm against the Cassandra version in use.
        """
        hits = sum(1 for cell in row['cells'] if len(cell) > 3 and cell[3] == 't')
        if hits:
            self._tombstones[row['key']] += hits

    def count_tombstones_in_sstable(self, sstable):
        """Process every row of one SSTable and add its size to the total."""
        rows = self.read_sstable_json(sstable)
        for entry in rows:
            self.count_tombstones_in_row(entry)
        self._total_bytes += rows.num_bytes

    def report(self, top):
        """Print a summary followed by the ``top`` keys with most tombstones."""
        # Ascending sort followed by a reversal (rather than reverse=True) so
        # that keys with equal counts keep the same relative order as before.
        ranked = sorted(self._tombstones.items(), key=operator.itemgetter(1))[::-1]
        print('Read {0} keys and {1} of data'.format(len(ranked), sizeof_fmt(self._total_bytes)))
        print('Top {0} keys with highest number tombstones'.format(top))
        for position, (key, count) in enumerate(ranked[0:top], 1):
            print("{0:3} {1} => {2}".format(str(position) + '.', key, count))
def main(argv=None):
    """Parse the command line, count tombstones and print the report.

    ``argv`` is the list of arguments to parse; when None, argparse falls
    back to ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(usage=__doc__)
    parser.add_argument('keyspace')
    parser.add_argument('table')
    parser.add_argument('--data-dir', default='/var/lib/cassandra/data')
    # type=int: without it a value given on the command line arrives as a
    # string and cannot be used as the slice bound in TombstoneCounter.report.
    parser.add_argument('--top-k', type=int, default=25)
    args = parser.parse_args(argv)
    t = TombstoneCounter(args.keyspace, args.table, args.data_dir)
    t.count_tombstones()
    t.report(args.top_k)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment