Package cc.mallet.topics
Class PolylingualTopicModel
- java.lang.Object
-
- cc.mallet.topics.PolylingualTopicModel
-
- All Implemented Interfaces:
java.io.Serializable
public class PolylingualTopicModel extends java.lang.Object implements java.io.Serializable
Latent Dirichlet Allocation for loosely parallel corpora in arbitrary languages.
- Author:
- David Mimno, Andrew McCallum
- See Also:
- Serialized Form
-
-
Nested Class Summary
Nested Classes
Modifier and Type | Class | Description
class
PolylingualTopicModel.TopicAssignment
-
Field Summary
Fields
Modifier and Type | Field | Description
protected double[]
alpha
protected Alphabet[]
alphabets
protected double
alphaSum
protected double[]
betas
protected double[]
betaSums
int
burninPeriod
protected java.util.ArrayList<PolylingualTopicModel.TopicAssignment>
data
static double
DEFAULT_BETA
protected int[]
docLengthCounts
protected java.text.NumberFormat
formatter
protected int
iterationsSoFar
protected double[][]
languageCachedCoefficients
protected int[]
languageMaxTypeCounts
protected double[]
languageSmoothingOnlyMasses
protected int[][]
languageTokensPerTopic
protected int[][][]
languageTypeTopicCounts
protected java.lang.String
modelFilename
int
numIterations
protected int
numStopwords
protected int
numTopics
protected int[]
oneDocTopicCounts
int
optimizeInterval
protected boolean
printLogLikelihood
protected Randoms
random
protected int
saveModelInterval
int
saveSampleInterval
protected int
saveStateInterval
int
showTopicsInterval
protected java.lang.String
stateFilename
protected LabelAlphabet
topicAlphabet
protected int
topicBits
protected int[][]
topicDocCounts
protected int
topicMask
protected int[]
vocabularySizes
int
wordsPerTopic
-
Constructor Summary
Constructors
Constructor | Description
PolylingualTopicModel(int numberOfTopics)
PolylingualTopicModel(int numberOfTopics, double alphaSum)
PolylingualTopicModel(int numberOfTopics, double alphaSum, Randoms random)
PolylingualTopicModel(LabelAlphabet topicAlphabet, double alphaSum, Randoms random)
-
Method Summary
All Methods | Static Methods | Instance Methods | Concrete Methods
Modifier and Type | Method | Description
void
addInstances(InstanceList[] training)
void
estimate()
void
estimate(int iterationsThisRound)
java.util.ArrayList<PolylingualTopicModel.TopicAssignment>
getData()
TopicInferencer
getInferencer(int language)
Return a tool for estimating topic distributions for new documents
int
getNumTopics()
MarginalProbEstimator
getProbEstimator(int language)
Return a tool for estimating topic distributions for new documents
LabelAlphabet
getTopicAlphabet()
void
loadTestingIDs(java.io.File testingIDFile)
static void
main(java.lang.String[] args)
double
modelLogLikelihood()
void
optimizeBetas()
void
printDocumentTopics(java.io.File f)
void
printDocumentTopics(java.io.PrintWriter pw)
void
printDocumentTopics(java.io.PrintWriter pw, double threshold, int max)
void
printState(java.io.File f)
void
printState(java.io.PrintStream out)
void
printTopWords(java.io.File file, int numWords, boolean useNewLines)
void
printTopWords(java.io.PrintStream out, int numWords, boolean usingNewLines)
static PolylingualTopicModel
read(java.io.File f)
protected void
sampleTopicsForOneDoc(PolylingualTopicModel.TopicAssignment topicAssignment, boolean shouldSaveState)
void
setBurninPeriod(int burninPeriod)
void
setModelOutput(int interval, java.lang.String filename)
void
setNumIterations(int numIterations)
void
setOptimizeInterval(int interval)
void
setRandomSeed(int seed)
void
setSaveState(int interval, java.lang.String filename)
Define how often and where to save the state
void
setTopicDisplay(int interval, int n)
void
write(java.io.File serializedModelFile)
-
-
-
Field Detail
-
data
protected java.util.ArrayList<PolylingualTopicModel.TopicAssignment> data
-
topicAlphabet
protected LabelAlphabet topicAlphabet
-
numStopwords
protected int numStopwords
-
numTopics
protected int numTopics
-
topicMask
protected int topicMask
-
topicBits
protected int topicBits
-
alphabets
protected Alphabet[] alphabets
-
vocabularySizes
protected int[] vocabularySizes
-
alpha
protected double[] alpha
-
alphaSum
protected double alphaSum
-
betas
protected double[] betas
-
betaSums
protected double[] betaSums
-
languageMaxTypeCounts
protected int[] languageMaxTypeCounts
-
DEFAULT_BETA
public static final double DEFAULT_BETA
- See Also:
- Constant Field Values
-
languageSmoothingOnlyMasses
protected double[] languageSmoothingOnlyMasses
-
languageCachedCoefficients
protected double[][] languageCachedCoefficients
-
oneDocTopicCounts
protected int[] oneDocTopicCounts
-
languageTypeTopicCounts
protected int[][][] languageTypeTopicCounts
-
languageTokensPerTopic
protected int[][] languageTokensPerTopic
-
docLengthCounts
protected int[] docLengthCounts
-
topicDocCounts
protected int[][] topicDocCounts
-
iterationsSoFar
protected int iterationsSoFar
-
numIterations
public int numIterations
-
burninPeriod
public int burninPeriod
-
saveSampleInterval
public int saveSampleInterval
-
optimizeInterval
public int optimizeInterval
-
showTopicsInterval
public int showTopicsInterval
-
wordsPerTopic
public int wordsPerTopic
-
saveModelInterval
protected int saveModelInterval
-
modelFilename
protected java.lang.String modelFilename
-
saveStateInterval
protected int saveStateInterval
-
stateFilename
protected java.lang.String stateFilename
-
random
protected Randoms random
-
formatter
protected java.text.NumberFormat formatter
-
printLogLikelihood
protected boolean printLogLikelihood
-
-
Constructor Detail
-
PolylingualTopicModel
public PolylingualTopicModel(int numberOfTopics)
-
PolylingualTopicModel
public PolylingualTopicModel(int numberOfTopics, double alphaSum)
-
PolylingualTopicModel
public PolylingualTopicModel(int numberOfTopics, double alphaSum, Randoms random)
-
PolylingualTopicModel
public PolylingualTopicModel(LabelAlphabet topicAlphabet, double alphaSum, Randoms random)
-
-
Method Detail
-
loadTestingIDs
public void loadTestingIDs(java.io.File testingIDFile) throws java.io.IOException
- Throws:
java.io.IOException
-
getTopicAlphabet
public LabelAlphabet getTopicAlphabet()
-
getNumTopics
public int getNumTopics()
-
getData
public java.util.ArrayList<PolylingualTopicModel.TopicAssignment> getData()
-
setNumIterations
public void setNumIterations(int numIterations)
-
setBurninPeriod
public void setBurninPeriod(int burninPeriod)
-
setTopicDisplay
public void setTopicDisplay(int interval, int n)
-
setRandomSeed
public void setRandomSeed(int seed)
-
setOptimizeInterval
public void setOptimizeInterval(int interval)
-
setModelOutput
public void setModelOutput(int interval, java.lang.String filename)
-
setSaveState
public void setSaveState(int interval, java.lang.String filename)
Define how often and where to save the state
- Parameters:
interval
- Save a copy of the state every interval iterations.
filename
- Save the state to this file, with the iteration number as a suffix
-
addInstances
public void addInstances(InstanceList[] training)
-
estimate
public void estimate() throws java.io.IOException
- Throws:
java.io.IOException
-
estimate
public void estimate(int iterationsThisRound) throws java.io.IOException
- Throws:
java.io.IOException
-
optimizeBetas
public void optimizeBetas()
-
sampleTopicsForOneDoc
protected void sampleTopicsForOneDoc(PolylingualTopicModel.TopicAssignment topicAssignment, boolean shouldSaveState)
-
printTopWords
public void printTopWords(java.io.File file, int numWords, boolean useNewLines) throws java.io.IOException
- Throws:
java.io.IOException
-
printTopWords
public void printTopWords(java.io.PrintStream out, int numWords, boolean usingNewLines)
-
printDocumentTopics
public void printDocumentTopics(java.io.File f) throws java.io.IOException
- Throws:
java.io.IOException
-
printDocumentTopics
public void printDocumentTopics(java.io.PrintWriter pw)
-
printDocumentTopics
public void printDocumentTopics(java.io.PrintWriter pw, double threshold, int max)
- Parameters:
pw
- A print writer
threshold
- Only print topics with proportion greater than this number
max
- Print no more than this many topics
-
printState
public void printState(java.io.File f) throws java.io.IOException
- Throws:
java.io.IOException
-
printState
public void printState(java.io.PrintStream out)
-
modelLogLikelihood
public double modelLogLikelihood()
-
getInferencer
public TopicInferencer getInferencer(int language)
Return a tool for estimating topic distributions for new documents
-
getProbEstimator
public MarginalProbEstimator getProbEstimator(int language)
Return a tool for estimating topic distributions for new documents
-
write
public void write(java.io.File serializedModelFile)
-
read
public static PolylingualTopicModel read(java.io.File f) throws java.lang.Exception
- Throws:
java.lang.Exception
-
main
public static void main(java.lang.String[] args) throws java.io.IOException
- Throws:
java.io.IOException
-
-