Multinomial naive Bayes in Julia, allowing for generic numeric types for the conditional probabilities. When using rational numbers, you can calculate exact probabilities without roundoff error.
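For example (a standalone illustration, not part of the file below), the unnormalized posterior that the test at the bottom computes is an exact rational, with no rounding at any intermediate step:

#Products of rational conditional probabilities incur no roundoff
@assert 3//4 * (3//7)^3 * 1//14 * 1//14 == 81//268912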
struct MultinomialNaiveBayes{T, V<:AbstractVector}
    feature_ratios::V #smoothed likelihood ratios P(feature j | true) / P(feature j | false)
    prior_ratio::T    #prior odds P(true) / P(false)
end
""" | |
fit(MultinomialNaiveBayes, [T,] features, labels, α = 1) -> MNB | |
fits a `MultinomialNaiveBayes` classifier `MNB` using the | |
`features` matrix and `labels` vector of `Bool`s. | |
The number of columns in `features` is assumed to be the same as the | |
length of `labels`. | |
`T` is the working numeric type, which defaults to `Float32`. | |
Other numeric types like `T = Rational` are possible, which permit | |
testing exact values of conditional probabilities. | |
`α` is a additive smoothing parameter used for regularization, | |
with 1 being Laplace smoothing, 0.5 being Jeffrys smoothing, | |
and other values being the more general Lidstone smoothing. | |
""" | |
function fit(::Type{MultinomialNaiveBayes{T, S}}, features, labels, α = 1) where
        {T<:Real, S<:AbstractVector{T}}

    m, n = size(features)
    feature_ratios = zeros(T, n) #probability ratios

    #Calculate prior ratio
    counts = zeros(Int, 2)
    counts_category = zeros(Int, 2)
    for i in 1:m
        k = labels[i] ? 1 : 2
        counts[k] += 1
        counts_category[k] += sum(view(features, i, :))
    end
    prior_ratio = T(counts[1]) / T(counts[2])

    #Calculate feature ratios
    counts = zeros(Int, 2)
    for j in 1:n
        counts[1] = counts[2] = 0
        for i in 1:m
            k = labels[i] ? 1 : 2
            counts[k] += features[i, j]
        end
        feature_ratios[j] = (T(counts[1] + α) / T(counts_category[1] + α * n)) /
                            (T(counts[2] + α) / T(counts_category[2] + α * n))
    end

    return MultinomialNaiveBayes{T, S}(feature_ratios, prior_ratio)
end
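#A minimal smoothing sketch (illustrative two-document data, not part of the
#original test below): with Laplace smoothing (α = 1), a feature seen only in
#the `true` class still gets a finite, nonzero ratio rather than a division by
#zero in the rational arithmetic.
let X = [1 0; 0 1], y = BitVector([true, false])
    MNB1 = fit(MultinomialNaiveBayes{Rational{Int}, Vector{Rational{Int}}}, X, y, 1)
    @assert MNB1.feature_ratios[1] == 2//1 #((1+1)/(1+2)) / ((0+1)/(1+2))
end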
#Default working numeric type
fit(::Type{MultinomialNaiveBayes}, features, labels, α = 1) =
    fit(MultinomialNaiveBayes, Float32, features, labels, α)

#Expand out working numeric type into parameters of MultinomialNaiveBayes
fit(::Type{MultinomialNaiveBayes}, T::Type{R}, features, labels, α = 1) where R<:Real =
    fit(MultinomialNaiveBayes{T, Vector{T}}, features, labels, α)
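#Convenience sketch (toy data, not from the original test): omitting the
#numeric type fits the model in the default working type, Float32.
let MNBf = fit(MultinomialNaiveBayes, [1 0; 0 1], BitVector([true, false]))
    @assert MNBf.prior_ratio isa Float32
    @assert MNBf.feature_ratios isa Vector{Float32}
end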
""" | |
predict(MNB, [T,] features) -> pr_true | |
predicts the probabilities of the label being `true` or `false` | |
from the `MultinomialNaiveBayes` classifier `MNB` and the `features`. | |
The optional numeric type `T` specifies the working type to calculate | |
the return probabilities, which can differ from the numeric type used | |
to train the `MNB` model. | |
""" | |
function predict(MNB::MultinomialNaiveBayes, T, features)
    pr = predictratio(MNB, T, features)
    return pr / (1 + pr)
end
#Ratio of the probabilities that the label is `true` versus `false`
function predictratio(MNB::MultinomialNaiveBayes, T, features)
    #Initialize probability ratio with the prior
    pr = T(MNB.prior_ratio)
    for (j, x) in enumerate(features)
        if x != 0
            pr *= (T(MNB.feature_ratios[j]))^x
        end
    end
    return pr
end
""" | |
predictlogratio(MNB, [T,] features) -> log_ratio | |
predicts the log of the ratio of probabilities that the label is | |
`true` or `false` from the `MultinomialNaiveBayes` classifier `MNB` | |
and the `features`. | |
The optional numeric type `T` specifies the working type to calculate | |
the return probabilities, which can differ from the numeric type used | |
to train the `MNB` model. | |
This function will be slower and restricted to fewer numeric types, but | |
can offer better precision when extremely small or large ratios of | |
conditional probabilities exist. | |
""" | |
function predictlogratio(MNB::MultinomialNaiveBayes, T, features)
    #Initialize log probability ratio with the log prior
    logpr = T(log(MNB.prior_ratio))
    for (j, x) in enumerate(features)
        if x != 0
            logpr += x * T(log(MNB.feature_ratios[j]))
        end
    end
    return logpr
end
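#Precision sketch (contrived tiny ratios, assumed values): in Float32 the plain
#probability ratio underflows to zero, while the log-domain version stays finite.
let MNBu = MultinomialNaiveBayes{Float64, Vector{Float64}}([1.0e-30], 1.0)
    @assert predictratio(MNBu, Float32, [2]) == 0.0f0     #(1e-30)^2 underflows
    @assert isfinite(predictlogratio(MNBu, Float32, [2])) #≈ -138.2
end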
#The default working numeric type is the type used to train the classifier
for f in (:predict, :predictratio, :predictlogratio) @eval begin
    $f(MNB::MultinomialNaiveBayes{T}, features) where T =
        $f(MNB, T, features)
end end
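#Dispatch sketch (hypothetical one-feature model): omitting T falls back to
#the numeric type the classifier was trained with.
let MNBd = MultinomialNaiveBayes{Float64, Vector{Float64}}([2.0], 1.0)
    @assert predict(MNBd, [1]) == predict(MNBd, Float64, [1])
end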
##############################################################################
# A simple test
##############################################################################

#Example from
#https://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html
using SparseArrays

featureset = sparse(
    #Chinese Beijing Shanghai Macao Japan Tokyo
    [2 1 0 0 0 0
     2 0 1 0 0 0
     1 0 0 1 0 0
     1 0 0 0 1 1])
labels = BitVector([true, true, true, false])
MNB = fit(MultinomialNaiveBayes, Rational, featureset, labels, 1)

#Hand-computed unnormalized posteriors with Laplace smoothing (α = 1)
upr_yes = 3//4 * (3//7)^3 * 1//14 * 1//14
upr_no = 1//4 * (2//9)^3 * 2//9 * 2//9
pr_yes = upr_yes // (upr_yes + upr_no)

testfeat = sparsevec([3, 0, 0, 0, 1, 1]) #Chinese Chinese Chinese Tokyo Japan
pred_yes = predict(MNB, Rational, testfeat)
@assert pred_yes == pr_yes #exact equality, thanks to rational arithmetic

r = predictratio(MNB, Float32, testfeat)
lr = predictlogratio(MNB, Float32, testfeat)
@assert log(r) ≈ lr
@assert r/(1+r) ≈ pred_yes
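#(For reference, float(pr_yes) ≈ 0.6898, so the test document is assigned to
#the `true` class, consistent with the worked example in the IR book chapter
#linked above; the numeric value is a hand computation, not from the original.)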