Raincode · April 12, 2019 19:15
diff --git a/huffman.m b/huffman.m
 % creates an optimal code for probability vector P using huffman coding
 % The codes are ordered according to the input probabilities.
 % Alternatively, you could sort them descending, or add another column to the cell
 % array containing the symbols (like the huffmandict function). This would require an
 % additional input parameter.

 % My measurements show this running faster than the huffmandict function.

 function [c,h,w]=huffman(P)
 % function [c,h,w]=huffman(P)
 % 
 % Huffman-Codierung
 %
 % P :   Auftrittswahrscheinlichkeiten
 % c :   Code als Cell-Array
 % h :   Entropie
 % w :   mittlere Codewortlänge
 % 
 % Für den Testvektor
 % P=[0.05 0.03 0.17 0.23 0.01 0.32 0.19]
 % ergibt sich h=2.3378 und w=2.39.

 % nützliche Matlab-Befehle: struct, sort
 assert(abs(sum(P) - 1) < 10e-6, "Probabilities must sum up to 100%");

 % compute entropy
 h = sum(P .* (-log2(P)));
 % each row corresponds to the probability in P
 c = cell(length(P), 1); % codes are represent as numerical vectors for bits

 % A symbol is used to represent dummy "fused" probabilities as well
 % Using the indices from sort ensures that the code cell array (i)
 % correspons to the input P(i)
 [P, indices] = sort(P, 'descend');
 symbols = struct('probability', num2cell(P), 'indices', num2cell(indices));
 % P = sort(P, 'descend');
 % symbols = struct('probability', num2cell(P), 'indices', num2cell(1:length(P)));

 while length(symbols) > 1
    % select the two lowest probabilities and add them
    % O(n) insert worst case vs log(n) binary search...
    last = symbols(end);
    preLast = symbols(end-1);
    % Build the code words by prepending bits
    c(last.indices) = cellfun(@(x)[0 x], c(last.indices), 'UniformOutput', false);
    c(preLast.indices) = cellfun(@(x)[1 x], c(preLast.indices), 'UniformOutput', false);
    % Insert dummy symbol representing combined probability of the two
    % lowest probabilities
    probSum = last.probability + preLast.probability;
    newSymbol = struct('probability', probSum, 'indices', [last.indices preLast.indices]);
    pos = find([symbols.probability] < probSum, 1);
    % insert dummy symbol and remove the two symbols which belong to it
    symbols = [symbols(1:pos-1) newSymbol symbols(pos:end-2)];
 end
 assert(length(symbols) == 1 && abs(symbols(1).probability - 1) < 10e-6, "Probability of tree root must add up to 100%");
 % compute average codeword length
 w = sum(cellfun('length', c) .* P(:));
	% creates an optimal code for probability vector P using huffman coding
	% The codes are ordered according to the input probabilities.
	% Alternatively, you could sort them descending, or add another column to the cell
	% array containing the symbols (like the huffmandict function). This would require an
	% additional input parameter.

	% My measurements show this running faster than the huffmandict function.

	function [c,h,w]=huffman(P)
	% function [c,h,w]=huffman(P)
	%
	% Huffman-Codierung
	%
	% P : Auftrittswahrscheinlichkeiten
	% c : Code als Cell-Array
	% h : Entropie
	% w : mittlere Codewortlänge
	%
	% Für den Testvektor
	% P=[0.05 0.03 0.17 0.23 0.01 0.32 0.19]
	% ergibt sich h=2.3378 und w=2.39.

	% nützliche Matlab-Befehle: struct, sort
	assert(abs(sum(P) - 1) < 10e-6, "Probabilities must sum up to 100%");

	% compute entropy
	h = sum(P .* (-log2(P)));
	% each row corresponds to the probability in P
	c = cell(length(P), 1); % codes are represent as numerical vectors for bits

	% A symbol is used to represent dummy "fused" probabilities as well
	% Using the indices from sort ensures that the code cell array (i)
	% correspons to the input P(i)
	[P, indices] = sort(P, 'descend');
	symbols = struct('probability', num2cell(P), 'indices', num2cell(indices));
	% P = sort(P, 'descend');
	% symbols = struct('probability', num2cell(P), 'indices', num2cell(1:length(P)));

	while length(symbols) > 1
	% select the two lowest probabilities and add them
	% O(n) insert worst case vs log(n) binary search...
	last = symbols(end);
	preLast = symbols(end-1);
	% Build the code words by prepending bits
	c(last.indices) = cellfun(@(x)[0 x], c(last.indices), 'UniformOutput', false);
	c(preLast.indices) = cellfun(@(x)[1 x], c(preLast.indices), 'UniformOutput', false);
	% Insert dummy symbol representing combined probability of the two
	% lowest probabilities
	probSum = last.probability + preLast.probability;
	newSymbol = struct('probability', probSum, 'indices', [last.indices preLast.indices]);
	pos = find([symbols.probability] < probSum, 1);
	% insert dummy symbol and remove the two symbols which belong to it
	symbols = [symbols(1:pos-1) newSymbol symbols(pos:end-2)];
	end
	assert(length(symbols) == 1 && abs(symbols(1).probability - 1) < 10e-6, "Probability of tree root must add up to 100%");
	% compute average codeword length
	w = sum(cellfun('length', c) .* P(:));