import java.util.HashMap;
import java.util.HashSet;
import java.util.ArrayList;

/**
 * One category of a naive Bayes text classifier.
 *
 * <p>A category is trained one word at a time via {@link #train(String)}; it
 * records how often each word was seen and how many words were seen in total.
 * Those counts drive {@link #percentage(String)}, {@link #relevance(String, ArrayList)}
 * and {@link #score(HashSet, int)}.
 */
public class BayesCategory {

    /** Numerator used for words never seen in training, so their frequency is small but non-zero. */
    private static final double UNSEEN_WORD_WEIGHT = 0.01;

    /** Number of times each word has been passed to {@link #train(String)}. */
    private final HashMap<String, Integer> count = new HashMap<String, Integer>();

    private final String name;

    /** Total number of words passed to {@link #train(String)}, duplicates included. */
    private int total = 0;

    /**
     * Creates an empty (untrained) category.
     *
     * @param name_ the category name; every occurrence of the literal text
     *              {@code ".txt"} is removed from it (not only a trailing one)
     */
    public BayesCategory(String name_) {
        name = name_.replaceAll("\\.txt", "");
    }

    /** @return the total number of words this category has been trained with */
    public int getTotal() {
        return total;
    }

    /** @return the category name with {@code ".txt"} removed */
    public String getName() {
        return name;
    }

    /**
     * "Trains" this category with the given word: keeps track of how many times
     * the word occurs, and how many words have been counted.
     *
     * @param word the word to record
     */
    public void train(String word) {
        // Single lookup instead of containsKey + get.
        Integer previous = count.get(word);
        count.put(word, previous == null ? 1 : previous + 1);
        total++;
    }

    /**
     * Returns the frequency of the given word in the category text, as a
     * fraction of all trained words (count / total, so 0..1 rather than 0..100).
     * Gives a small non-zero number for words that don't exist in the category
     * text.
     *
     * @param word the word to look up
     * @return the word's relative frequency; {@code 0.0} if the category has
     *         not been trained at all
     */
    public double percentage(String word) {
        if (total == 0) {
            // Nothing trained yet: dividing by zero would yield Infinity and
            // poison relevance()/score() with Infinity or NaN.
            return 0.0;
        }
        Integer occurrences = count.get(word);
        if (occurrences == null) {
            return UNSEEN_WORD_WEIGHT / (double) total;
        }
        return occurrences / (double) total;
    }

    /**
     * Returns the "relevance" of a particular word in assigning a text to this
     * category: this category's frequency for the word divided by the sum of
     * the frequencies over {@code categories}. (This is what Paul Graham calls
     * the "probability.")
     *
     * <p>When {@code categories} includes this category the result lies between
     * zero and one: close to one means the word occurs almost only in this
     * category, close to zero means it occurs almost only in the others.
     *
     * @param word       the word to evaluate
     * @param categories the categories to normalise against; presumably meant
     *                   to contain every category including this one (verify
     *                   against the caller)
     * @return the relevance; {@code 0.0} if this category is untrained
     */
    public double relevance(String word, ArrayList<BayesCategory> categories) {
        double own = percentage(word);
        if (own == 0.0) {
            // Only possible for an untrained category; avoids 0/0 = NaN when
            // every listed category is untrained as well.
            return 0.0;
        }
        double percentageSum = 0;
        for (BayesCategory bcat : categories) {
            percentageSum += bcat.percentage(word);
        }
        return own / percentageSum;
    }

    /**
     * Returns a "score" for the given set of tokens. Conceptually the score is
     * the product of the frequency of each word in the source text as it occurs
     * in the category text, multiplied by the probability that any given token
     * will occur in this category.
     *
     * <p>The value returned is the natural logarithm of that product: we take
     * the sum of logs rather than the product, since
     * {@code log(a * b) == log(a) + log(b)} and this gives more manageable
     * numbers.
     *
     * @param src      the distinct tokens of the text being classified
     * @param totalAll the total number of tokens in all category texts
     * @return the log-score (higher is a better match);
     *         {@link Double#NEGATIVE_INFINITY} if this category is untrained
     */
    public double score(HashSet<String> src, int totalAll) {
        if (total == 0) {
            // An untrained category has probability zero; returning early also
            // avoids log(0 / 0.0) = NaN when totalAll is zero too.
            return Double.NEGATIVE_INFINITY;
        }
        double scoreVal = 0;
        for (String word : src) {
            scoreVal += Math.log(percentage(word));
        }
        // Prior: the share of all training tokens that belong to this category.
        scoreVal += Math.log(getTotal() / (double) totalAll);
        return scoreVal;
    }
}