Multinomial naive Bayes in Julia, allowing for generic numeric types for the conditional probabilities. When using rational numbers, you can calculate exact probabilities without roundoff error.
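For example (a standalone illustration, not part of the file below), the unnormalized posterior that the test at the bottom computes is an exact rational, with no rounding at any intermediate step:

#Products of rational conditional probabilities incur no roundoff
@assert 3//4 * (3//7)^3 * 1//14 * 1//14 == 81//268912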
struct MultinomialNaiveBayes{T, V<:AbstractVector}
    feature_ratios::V #smoothed likelihood ratios P(feature j | true) / P(feature j | false)
    prior_ratio::T    #prior odds P(true) / P(false)
end
""" | |
fit(MultinomialNaiveBayes, [T,] features, labels, α = 1) -> MNB | |
fits a `MultinomialNaiveBayes` classifier `MNB` using the | |
`features` matrix and `labels` vector of `Bool`s. | |
The number of columns in `features` is assumed to be the same as the | |
length of `labels`. | |
`T` is the working numeric type, which defaults to `Float32`. | |
Other numeric types like `T = Rational` are possible, which permit | |
testing exact values of conditional probabilities. | |
`α` is a additive smoothing parameter used for regularization, | |
with 1 being Laplace smoothing, 0.5 being Jeffrys smoothing, | |
and other values being the more general Lidstone smoothing. | |
""" | |
function fit(::Type{MultinomialNaiveBayes{T, S}}, features, labels, α = 1) where
        {T<:Real, S<:AbstractVector{T}}

    m, n = size(features)
    feature_ratios = zeros(T, n) #probability ratios

    #Calculate prior ratio
    counts = zeros(Int, 2)
    counts_category = zeros(Int, 2)
    for i in 1:m
        k = labels[i] ? 1 : 2
        counts[k] += 1
        counts_category[k] += sum(view(features, i, :))
    end
    prior_ratio = T(counts[1]) / T(counts[2])

    #Calculate feature ratios
    counts = zeros(Int, 2)
    for j in 1:n
        counts[1] = counts[2] = 0
        for i in 1:m
            k = labels[i] ? 1 : 2
            counts[k] += features[i, j]
        end
        feature_ratios[j] = (T(counts[1] + α) / T(counts_category[1] + α * n)) /
                            (T(counts[2] + α) / T(counts_category[2] + α * n))
    end

    return MultinomialNaiveBayes{T, S}(feature_ratios, prior_ratio)
end
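#A minimal smoothing sketch (illustrative two-document data, not part of the
#original test below): with Laplace smoothing (α = 1), a feature seen only in
#the `true` class still gets a finite, nonzero ratio rather than a division by
#zero in the rational arithmetic.
let X = [1 0; 0 1], y = BitVector([true, false])
    MNB1 = fit(MultinomialNaiveBayes{Rational{Int}, Vector{Rational{Int}}}, X, y, 1)
    @assert MNB1.feature_ratios[1] == 2//1 #((1+1)/(1+2)) / ((0+1)/(1+2))
end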
#Default working numeric type
fit(::Type{MultinomialNaiveBayes}, features, labels, α = 1) =
    fit(MultinomialNaiveBayes, Float32, features, labels, α)

#Expand out working numeric type into parameters of MultinomialNaiveBayes
fit(::Type{MultinomialNaiveBayes}, T::Type{R}, features, labels, α = 1) where R<:Real =
    fit(MultinomialNaiveBayes{T, Vector{T}}, features, labels, α)
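#Convenience sketch (toy data, not from the original test): omitting the
#numeric type fits the model in the default working type, Float32.
let MNBf = fit(MultinomialNaiveBayes, [1 0; 0 1], BitVector([true, false]))
    @assert MNBf.prior_ratio isa Float32
    @assert MNBf.feature_ratios isa Vector{Float32}
end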
""" | |
predict(MNB, [T,] features) -> pr_true | |
predicts the probabilities of the label being `true` or `false` | |
from the `MultinomialNaiveBayes` classifier `MNB` and the `features`. | |
The optional numeric type `T` specifies the working type to calculate | |
the return probabilities, which can differ from the numeric type used | |
to train the `MNB` model. | |
""" | |
function predict(MNB::MultinomialNaiveBayes, T, features)
    pr = predictratio(MNB, T, features)
    return pr / (1 + pr)
end
#Ratio of the probabilities that the label is `true` versus `false`
function predictratio(MNB::MultinomialNaiveBayes, T, features)
    #Initialize probability ratio with the prior
    pr = T(MNB.prior_ratio)
    for (j, x) in enumerate(features)
        if x != 0
            pr *= (T(MNB.feature_ratios[j]))^x
        end
    end
    return pr
end
""" | |
predictlogratio(MNB, [T,] features) -> log_ratio | |
predicts the log of the ratio of probabilities that the label is | |
`true` or `false` from the `MultinomialNaiveBayes` classifier `MNB` | |
and the `features`. | |
The optional numeric type `T` specifies the working type to calculate | |
the return probabilities, which can differ from the numeric type used | |
to train the `MNB` model. | |
This function will be slower and restricted to fewer numeric types, but | |
can offer better precision when extremely small or large ratios of | |
conditional probabilities exist. | |
""" | |
function predictlogratio(MNB::MultinomialNaiveBayes, T, features)
    #Initialize log probability ratio with the log prior
    logpr = T(log(MNB.prior_ratio))
    for (j, x) in enumerate(features)
        if x != 0
            logpr += x * T(log(MNB.feature_ratios[j]))
        end
    end
    return logpr
end
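#Precision sketch (contrived tiny ratios, assumed values): in Float32 the plain
#probability ratio underflows to zero, while the log-domain version stays finite.
let MNBu = MultinomialNaiveBayes{Float64, Vector{Float64}}([1.0e-30], 1.0)
    @assert predictratio(MNBu, Float32, [2]) == 0.0f0     #(1e-30)^2 underflows
    @assert isfinite(predictlogratio(MNBu, Float32, [2])) #≈ -138.2
end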
#The default working numeric type is the type used to train the classifier
for f in (:predict, :predictratio, :predictlogratio) @eval begin
    $f(MNB::MultinomialNaiveBayes{T}, features) where T =
        $f(MNB, T, features)
end end
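#Dispatch sketch (hypothetical one-feature model): omitting T falls back to
#the numeric type the classifier was trained with.
let MNBd = MultinomialNaiveBayes{Float64, Vector{Float64}}([2.0], 1.0)
    @assert predict(MNBd, [1]) == predict(MNBd, Float64, [1])
end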
##############################################################################
# A simple test
##############################################################################

#Example from
#https://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html
using SparseArrays

featureset = sparse(
    #Chinese Beijing Shanghai Macao Japan Tokyo
    [2 1 0 0 0 0
     2 0 1 0 0 0
     1 0 0 1 0 0
     1 0 0 0 1 1])
labels = BitVector([true, true, true, false])
MNB = fit(MultinomialNaiveBayes, Rational, featureset, labels, 1)

#Hand-computed unnormalized posteriors with Laplace smoothing (α = 1)
upr_yes = 3//4 * (3//7)^3 * 1//14 * 1//14
upr_no = 1//4 * (2//9)^3 * 2//9 * 2//9
pr_yes = upr_yes // (upr_yes + upr_no)

testfeat = sparsevec([3, 0, 0, 0, 1, 1]) #Chinese Chinese Chinese Tokyo Japan
pred_yes = predict(MNB, Rational, testfeat)
@assert pred_yes == pr_yes #exact equality, thanks to rational arithmetic

r = predictratio(MNB, Float32, testfeat)
lr = predictlogratio(MNB, Float32, testfeat)
@assert log(r) ≈ lr
@assert r/(1+r) ≈ pred_yes
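#(For reference, float(pr_yes) ≈ 0.6898, so the test document is assigned to
#the `true` class, consistent with the worked example in the IR book chapter
#linked above; the numeric value is a hand computation, not from the original.)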