import java.util.ArrayList; import java.util.HashSet; import com.decontextualize.a2z.TextFilter; public class BayesClassifier extends TextFilter { public static void main(String[] args) { BayesClassifier bc = new BayesClassifier(); for (int i = 0; i < args.length; i++) { bc.addCategory(args[i]); } bc.run(); } private ArrayList categories = new ArrayList(); private HashSet uniqueWords = new HashSet(); // add a category, training it with words from a particular file public void addCategory(String fname) { BayesCategory cat = new BayesCategory(fname); String[] lines = new TextFilter().collectLines(fromFile(fname)); for (String line: lines) { String[] tokens = line.split(" "); for (String token: tokens) { cat.train(token); } } categories.add(cat); } public void eachLine(String line) { String[] tokens = line.split(" "); for (String token: tokens) { uniqueWords.add(token); } } public void end() { // calculate total number of words in all categories (needed for bayes // formula) int categoryWordTotal = 0; for (BayesCategory bcat: categories) { categoryWordTotal += bcat.getTotal(); } // print out scores for each category (higher is better) double highest = -100000000; String winner = ""; for (BayesCategory bcat: categories) { double score = bcat.score(uniqueWords, categoryWordTotal); if (score>highest) { winner = bcat.getName(); highest=score; } } // the following loop will print out the "relevance" of each word in // assigning the text to a particular category (for long texts, will // produce a lot of output) double highestProb = 0; String deadGiveAway = ""; for (String word: uniqueWords) { for (BayesCategory bcat: categories) { double wordProb = bcat.relevance(word, categories); if (bcat.getName().equals(winner)) { if (wordProb>highestProb) { highestProb = wordProb; deadGiveAway = word; } } } } if (deadGiveAway.length()<3) println("Sorry, I'm not sure what language this text is written in."); else { println("This text is definitely written in " + winner + ". " + "The word \"" + deadGiveAway + "\" gave it away.\n\n"); } } }