说明
以下 Java UnknownWordModel 示例是从最受好评的开源项目中提取的实现代码,你可以参考下面示例的使用方式。
编程语言: Java
类/类型: UnknownWordModel
示例 #1
文件: BaseLexicon.java
项目: rodolfopc/ufmg-nlp
/**
 * Records one tagging observation with the given weight.
 *
 * <p>Unseen observations are handed straight to the unknown word model. Seen observations are
 * counted in {@code seenCounter}; in addition, an entry whose tag is {@code nullTag} is
 * remembered in {@code words}, and otherwise an entry whose word is {@code nullWord} is
 * remembered in {@code tags}. Full word/tag pairs are only counted.
 *
 * @param seen whether the observation belongs to the "seen" statistics
 * @param itw the word/tag pair (either side may be the null marker)
 * @param count the weight to add for this observation
 */
protected void addTagging(boolean seen, IntTaggedWord itw, double count) {
  if (!seen) {
    uwModel.addTagging(seen, itw, count);
    return;
  }

  seenCounter.incrementCount(itw, count);

  if (itw.tag() == nullTag) {
    // word-only entry
    words.add(itw);
    return;
  }
  if (itw.word() == nullWord) {
    // tag-only entry
    tags.add(itw);
  }
}
示例 #2
文件: EnglishUnknownWordModelTrainer.java
项目: alishir/CoreNLP
/**
 * Updates the training counters with a single tagged word.
 *
 * <p>The word-only count in {@code seenCounter} is always incremented. Once {@code treesRead}
 * exceeds {@code indexToStartUnkCounting}, a word whose (just updated) count is still below 1.5
 * is treated as unknown: its signature is looked up and {@code unSeenCounter} is incremented for
 * the signature/tag pair, the tag alone, the signature alone and the overall total.
 *
 * @param tw the tagged word to learn from
 * @param loc the position of the word, forwarded to the signature lookup
 * @param weight the amount added to every counter that is touched
 */
public void train(TaggedWord tw, int loc, double weight) {
  IntTaggedWord iTW = new IntTaggedWord(tw.word(), tw.tag(), wordIndex, tagIndex);
  IntTaggedWord tagOnly = new IntTaggedWord(nullWord, iTW.tag);
  IntTaggedWord wordOnly = new IntTaggedWord(iTW.word, nullTag);
  seenCounter.incrementCount(wordOnly, weight);

  // Unknown-word statistics are gathered only some way through the trees
  // (treesRead counts from 1).
  boolean countingUnknowns = treesRead > indexToStartUnkCounting;
  if (!countingUnknowns) {
    return;
  }

  // Anything counted 1.5 times or more is a known word; a new tag for a known
  // word is deliberately ignored for now.
  boolean entirelyUnknown = seenCounter.getCount(wordOnly) < 1.5;
  if (!entirelyUnknown) {
    return;
  }

  String wordStr = wordIndex.get(iTW.word);
  int signature = model.getSignatureIndex(iTW.word, loc, wordStr);

  if (DOCUMENT_UNKNOWNS) {
    EncodingPrintWriter.err.println(
        "Unknown word/tag/sig:\t"
            + wordStr
            + '\t'
            + tagIndex.get(iTW.tag)
            + '\t'
            + wordIndex.get(signature),
        "UTF-8");
  }

  IntTaggedWord sigWithTag = new IntTaggedWord(signature, iTW.tag);
  IntTaggedWord sigOnly = new IntTaggedWord(signature, nullTag);
  unSeenCounter.incrementCount(sigWithTag, weight);
  unSeenCounter.incrementCount(tagOnly, weight);
  unSeenCounter.incrementCount(sigOnly, weight);
  unSeenCounter.incrementCount(NULL_ITW, weight);
}
示例 #3
文件: BaseLexicon.java
项目: rodolfopc/ufmg-nlp
/**
 * Writes a summary of this lexicon to standard output.
 *
 * <p>The report contains the basic sizes, a histogram of how many taggings each word has (the
 * last bin collects everything at or above {@code STATS_BINS - 1}), the unseen counter, and, for
 * small lexicons (fewer than 50 words and fewer than 10 tags), a table of
 * {@code score(...)} values for every word/tag combination.
 */
public void printLexStats() {
  System.out.println("BaseLexicon statistics");
  System.out.println("unknownLevel is " + getUnknownWordModel().getUnknownLevel());
  System.out.println("Sum of rulesWithWord: " + numRules());
  System.out.println("Tags size: " + tags.size());
  final int wordCount = words.size();
  System.out.println("Words size: " + wordCount);
  System.out.println(
      "rulesWithWord length: "
          + rulesWithWord.length
          + " [should be sum of words + unknown sigs]");

  // Histogram: bin index = number of taggings of a word, clamped to the last bin.
  final int lastBin = STATS_BINS - 1;
  int[] wordsPerBin = new int[STATS_BINS];
  List<List<String>> samplesPerBin = new ArrayList<List<String>>();
  for (int bin = 0; bin < STATS_BINS; bin++) {
    samplesPerBin.add(new ArrayList<String>());
  }
  for (int wordId = 0; wordId < rulesWithWord.length; wordId++) {
    int bin = Math.min(rulesWithWord[wordId].size(), lastBin);
    wordsPerBin[bin]++;
    // Words are remembered only for tiny lexicons or for the upper half of the bins.
    boolean remember = wordCount <= 20 || bin >= STATS_BINS / 2;
    if (remember) {
      samplesPerBin.get(bin).add(wordIndex.get(wordId));
    }
  }

  System.out.println("Stats on how many taggings for how many words");
  for (int bin = 0; bin < STATS_BINS; bin++) {
    String row = bin + " taggings: " + wordsPerBin[bin] + " words ";
    if (wordCount <= 20 || bin >= STATS_BINS / 2) {
      row += samplesPerBin.get(bin);
    }
    System.out.println(row);
  }

  NumberFormat nf = NumberFormat.getNumberInstance();
  nf.setMaximumFractionDigits(0);
  System.out.println("Unseen counter: " + Counters.toString(uwModel.unSeenCounter(), nf));

  // The full score table is only printed for small lexicons.
  if (wordCount >= 50 || tags.size() >= 10) {
    return;
  }

  nf.setMaximumFractionDigits(3);
  StringWriter table = new StringWriter();
  PrintWriter out = new PrintWriter(table);
  out.println("Tagging probabilities log P(word|tag)");
  for (int tagId = 0; tagId < tags.size(); tagId++) {
    out.print("\t" + tagIndex.get(tagId));
  }
  out.println();
  for (int wordId = 0; wordId < wordCount; wordId++) {
    String wordStr = wordIndex.get(wordId);
    out.print(wordStr);
    out.print('\t');
    for (int tagId = 0; tagId < tags.size(); tagId++) {
      IntTaggedWord pair = new IntTaggedWord(wordId, tagId);
      out.print(nf.format(score(pair, 1, wordStr)));
      // The row is terminated after the last tag column, tab-separated otherwise.
      if (tagId == tags.size() - 1) {
        out.println();
      } else {
        out.print('\t');
      }
    }
  }
  out.close();
  System.out.println(table);
}
示例 #4
文件: BaseLexicon.java
项目: rodolfopc/ufmg-nlp
/**
 * Scores a word/tag pair at a sentence position; the result is usually an estimate of
 * log P(word | tag).
 *
 * <p><b>Counts used.</b> From {@code seenCounter}: {@code c_TW} (word and tag), {@code c_W}
 * (word only), {@code c_T} (tag only) and {@code total} (the {@code NULL_ITW} entry). From the
 * unknown word model's unseen counter: {@code c_Tunseen} (tag only) and {@code totalUnseen}
 * (the {@code NULL_ITW} entry).
 *
 * <p><b>Seen words</b> ({@code c_W > 0}):
 *
 * <ul>
 *   <li>{@code p_T_U} is {@code c_Tunseen / totalUnseen}, or, when
 *       {@code useSignatureForKnownSmoothing} is set, the value returned by the unknown word
 *       model's {@code scoreProbTagGivenWordSignature} (called with {@code smooth[0]}).
 *   <li>If {@code c_W > smoothInUnknownsThreshold} and {@code c_TW > 0}, then
 *       {@code pb_T_W = c_TW / c_W}. Otherwise
 *       {@code pb_T_W = (c_TW + smooth[1] * p_T_U) / (c_W + smooth[1])}. With
 *       {@code smartMutation}, {@code p_T_U} is first replaced by {@code 0.1 * p_T_U} plus
 *       {@code 0.9} times the sum, over every tag {@code t} observed with this word, of
 *       {@code (count(t, W) / c_W) * m_TT[tag][t] / m_T[t]}.
 *   <li>The result is {@code log(pb_T_W * p_W / p_T)} with {@code p_W = c_W / total} and
 *       {@code p_T = c_T / total} (Bayes rule). Note that this doesn't really properly reserve
 *       mass to unknowns.
 * </ul>
 *
 * <p><b>Unseen words</b> ({@code c_W == 0}): the score comes from the unknown word model's
 * {@code score} method, called with {@code c_T}, {@code total} and {@code smooth[0]}. For a
 * negative {@code loc} that method is evaluated at positions 0 and 1 and the two results are
 * combined as {@code log((exp(s0) + 2 * exp(s1)) / 3)}.
 *
 * <p>Any result that is not greater than -100 is returned as {@link Float#NEGATIVE_INFINITY}.
 *
 * @param iTW An IntTaggedWord pairing a word and POS tag
 * @param loc The position in the sentence. In this method it is only stored for debugging,
 *     forwarded to the unknown word model, and tested for being negative
 * @param word A string forwarded unchanged to the unknown word model (presumably the surface
 *     form of {@code iTW.word}; confirm against callers)
 * @return A float score, usually log P(word|tag), or negative infinity when the score is cut off
 */
public float score(IntTaggedWord iTW, int loc, String word) {
// both actual
double c_TW = seenCounter.getCount(iTW);
// double x_TW = xferCounter.getCount(iTW);
IntTaggedWord temp = new IntTaggedWord(iTW.word, nullTag);
// word counts
double c_W = seenCounter.getCount(temp);
// double x_W = xferCounter.getCount(temp);
// totals
double total = seenCounter.getCount(NULL_ITW);
double totalUnseen = uwModel.unSeenCounter().getCount(NULL_ITW);
// temp is reused: from here on it is the tag-only key
temp = new IntTaggedWord(nullWord, iTW.tag);
// tag counts
double c_T = seenCounter.getCount(temp);
double c_Tunseen = uwModel.unSeenCounter().getCount(temp);
double pb_W_T; // always set below
if (DEBUG_LEXICON) {
// dump info about last word: when the word changes, flush what was accumulated
// for the previous one (debugLastWord is only updated in the seen-word branch below)
if (iTW.word != debugLastWord) {
if (debugLastWord >= 0 && debugPrefix != null) {
// the 2nd conjunct in test above handles older serialized files
EncodingPrintWriter.err.println(debugPrefix + debugProbs + debugNoProbs, "UTF-8");
}
}
}
// a word counts as seen as soon as it has any positive word-only count
boolean seen = (c_W > 0.0);
if (seen) {
// known word model for P(T|W)
if (DEBUG_LEXICON_SCORE) {
System.err.println(
"Lexicon.score "
+ wordIndex.get(iTW.word)
+ "/"
+ tagIndex.get(iTW.tag)
+ " as known word.");
}
// c_TW = Math.sqrt(c_TW); [cdm: funny math scaling? dunno who played with this]
// c_TW += 0.5;
// p_T_U: prior P(tag | unseen word) used to smooth rarely seen words
double p_T_U;
if (useSignatureForKnownSmoothing) { // only works for English currently
p_T_U = getUnknownWordModel().scoreProbTagGivenWordSignature(iTW, loc, smooth[0], word);
if (DEBUG_LEXICON_SCORE)
System.err.println(
"With useSignatureForKnownSmoothing, P(T|U) is "
+ p_T_U
+ " rather than "
+ (c_Tunseen / totalUnseen));
} else {
// NOTE(review): no guard for totalUnseen == 0 is visible here; double division then
// yields NaN or infinity -- confirm that callers never reach this with an empty counter
p_T_U = c_Tunseen / totalUnseen;
}
double pb_T_W; // always set below
if (DEBUG_LEXICON_SCORE) {
System.err.println(
"c_W is "
+ c_W
+ " mle = "
+ (c_TW / c_W)
+ " smoothInUnknownsThresh is "
+ smoothInUnknownsThreshold
+ " base p_T_U is "
+ c_Tunseen
+ "/"
+ totalUnseen
+ " = "
+ p_T_U);
}
// the trailing c_W > 0.0 test is already implied by `seen` in this branch
if (c_W > smoothInUnknownsThreshold && c_TW > 0.0 && c_W > 0.0) {
// we've seen the word enough times to have confidence in its tagging
pb_T_W = c_TW / c_W;
} else {
// we haven't seen the word enough times to have confidence in its
// tagging
if (smartMutation) {
int numTags = tagIndex.size();
// (re)build when the tables are missing or their length no longer matches the tag
// index; presumably buildPT_T fills m_TT and m_T -- confirm
if (m_TT == null || numTags != m_T.length) {
buildPT_T();
}
// keep 10% of the unseen prior; the remaining 90% is spread over the tags actually
// observed with this word, weighted by m_TT[tag][t] / m_T[t]
p_T_U *= 0.1;
// System.out.println("Checking "+iTW);
for (int t = 0; t < numTags; t++) {
IntTaggedWord iTW2 = new IntTaggedWord(iTW.word, t);
double p_T_W2 = seenCounter.getCount(iTW2) / c_W;
if (p_T_W2 > 0) {
// System.out.println(" Observation of "+tagIndex.get(t)+"
// ("+seenCounter.getCount(iTW2)+") mutated to
// "+tagIndex.get(iTW.tag)+" at rate
// "+(m_TT[tag][t]/m_T[t]));
p_T_U += p_T_W2 * m_TT[iTW.tag][t] / m_T[t] * 0.9;
}
}
}
if (DEBUG_LEXICON_SCORE) {
System.err.println("c_TW = " + c_TW + " c_W = " + c_W + " p_T_U = " + p_T_U);
}
// double pb_T_W = (c_TW+smooth[1]*x_TW)/(c_W+smooth[1]*x_W);
// smooth the observed counts towards the prior p_T_U with weight smooth[1]
pb_T_W = (c_TW + smooth[1] * p_T_U) / (c_W + smooth[1]);
}
double p_T = (c_T / total);
double p_W = (c_W / total);
// Bayes rule: P(W|T) = P(T|W) * P(W) / P(T), taken in log space
pb_W_T = Math.log(pb_T_W * p_W / p_T);
if (DEBUG_LEXICON) {
// start a fresh debug record whenever the word changes
if (iTW.word != debugLastWord) {
debugLastWord = iTW.word;
debugLoc = loc;
debugProbs = new StringBuilder();
debugNoProbs = new StringBuilder("impossible: ");
debugPrefix = "Lexicon: " + wordIndex.get(debugLastWord) + " (known): ";
}
// finite scores are listed with their counts; -infinity tags go on the "impossible" list
if (pb_W_T > Double.NEGATIVE_INFINITY) {
NumberFormat nf = NumberFormat.getNumberInstance();
nf.setMaximumFractionDigits(3);
debugProbs.append(
tagIndex.get(iTW.tag)
+ ": cTW="
+ c_TW
+ " c_T="
+ c_T
+ " pb_T_W="
+ nf.format(pb_T_W)
+ " log pb_W_T="
+ nf.format(pb_W_T)
+ ", ");
// debugProbs.append("\n" + "smartMutation=" + smartMutation + "
// smoothInUnknownsThreshold=" + smoothInUnknownsThreshold + "
// smooth0=" + smooth[0] + "smooth1=" + smooth[1] + " p_T_U=" + p_T_U
// + " c_W=" + c_W);
} else {
debugNoProbs.append(tagIndex.get(iTW.tag)).append(' ');
}
} // end if (DEBUG_LEXICON)
} else { // when unseen
if (loc >= 0) {
pb_W_T = getUnknownWordModel().score(iTW, loc, c_T, total, smooth[0], word);
} else {
// For negative we now do a weighted average for the dependency grammar :-)
// The two scores are exponentiated, averaged with weights 1 (loc 0) and 2 (loc 1),
// and the average is taken back to log space.
double pb_W0_T = getUnknownWordModel().score(iTW, 0, c_T, total, smooth[0], word);
double pb_W1_T = getUnknownWordModel().score(iTW, 1, c_T, total, smooth[0], word);
pb_W_T = Math.log((Math.exp(pb_W0_T) + 2 * Math.exp(pb_W1_T)) / 3);
}
}
// Categorical cutoff if score is too low
// (a NaN score also fails this comparison and therefore becomes negative infinity)
if (pb_W_T > -100.0) {
return (float) pb_W_T;
}
return Float.NEGATIVE_INFINITY;
} // end score()
示例 #5
文件: BaseLexicon.java
项目: rodolfopc/ufmg-nlp
/**
 * Rebuilds {@code rulesWithWord} and {@code tags} from the current counters.
 *
 * <p>Steps, in order:
 *
 * <ol>
 *   <li>Look up the index of {@code UNKNOWN_WORD} and allocate one empty tagging list per entry
 *       of {@code wordIndex}.
 *   <li>Collect into {@code tags} every tag-only key of {@code seenCounter}.
 *   <li>For each such tag whose count in the unseen counter exceeds
 *       {@code trainOptions.openClassTypesThreshold}, add an unknown-word tagging for it.
 *   <li>Add every full word/tag key of {@code seenCounter} to the list of its word.
 * </ol>
 *
 * <p>Progress and diagnostics go to standard error when {@code testOptions.verbose} or
 * {@code DEBUG_LEXICON} is set.
 */
protected void initRulesWithWord() {
  final boolean reportProgress = testOptions.verbose || DEBUG_LEXICON;
  if (reportProgress) {
    System.err.print("\nInitializing lexicon scores ... ");
  }

  // The unknown-word lookup stays ahead of the size read; the `true` flag presumably adds the
  // word to the index when it is missing.
  int unkWord = wordIndex.indexOf(UNKNOWN_WORD, true);
  int numWords = wordIndex.size();
  rulesWithWord = new List[numWords];
  for (int wordId = 0; wordId < numWords; wordId++) {
    // most words end up with only 1 or 2 taggings
    rulesWithWord[wordId] = new ArrayList<IntTaggedWord>(1);
  }

  // Tag-only entries of the seen counter define the tag set.
  tags = new HashSet<IntTaggedWord>();
  for (IntTaggedWord key : seenCounter.keySet()) {
    boolean tagOnly = key.word() == nullWord && key.tag() != nullTag;
    if (tagOnly) {
      tags.add(key);
    }
  }

  // tags for unknown words
  if (DEBUG_LEXICON) {
    System.err.println(
        "Lexicon initializing tags for UNKNOWN WORD ("
            + Lexicon.UNKNOWN_WORD
            + ", "
            + unkWord
            + ')');
    System.err.println("unSeenCounter is: " + uwModel.unSeenCounter());
    System.err.println(
        "Train.openClassTypesThreshold is " + trainOptions.openClassTypesThreshold);
  }

  for (IntTaggedWord tagKey : tags) {
    if (DEBUG_LEXICON) {
      System.err.println(
          "Entry for " + tagKey + " is " + uwModel.unSeenCounter().getCount(tagKey));
    }
    double typeCount = uwModel.unSeenCounter().getCount(tagKey);
    // A tag is treated as open class once it has been seen with enough unknown types.
    if (typeCount > trainOptions.openClassTypesThreshold) {
      IntTaggedWord unknownTagging = new IntTaggedWord(unkWord, tagKey.tag);
      rulesWithWord[unknownTagging.word].add(unknownTagging);
    }
  }

  if (reportProgress) {
    System.err.print("The " + rulesWithWord[unkWord].size() + " open class tags are: [");
    for (IntTaggedWord openTagging : rulesWithWord[unkWord]) {
      System.err.print(" " + tagIndex.get(openTagging.tag()));
      if (DEBUG_LEXICON) {
        IntTaggedWord tagOnlyKey = new IntTaggedWord(nullWord, openTagging.tag);
        System.err.print(
            " (tag "
                + openTagging.tag()
                + ", type count is "
                + uwModel.unSeenCounter().getCount(tagOnlyKey)
                + ')');
      }
    }
    System.err.println(" ] ");
  }

  // Finally register every fully specified word/tag pair under its word.
  for (IntTaggedWord key : seenCounter.keySet()) {
    if (key.tag() == nullTag || key.word() == nullWord) {
      continue;
    }
    rulesWithWord[key.word].add(key);
  }
}