
Commit

added language detection interface, and updated language detection implementation. new version 1.3.6
isanvicente committed Jul 12, 2018
1 parent 8c7bedf commit d388f05
Showing 3 changed files with 194 additions and 8 deletions.
6 changes: 3 additions & 3 deletions pom.xml
@@ -31,7 +31,7 @@
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.11</version>
<version>4.12</version>
<scope>test</scope>
</dependency>
<dependency>
@@ -103,7 +103,7 @@
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.36</version>
<version>5.1.46</version>
</dependency>
<dependency>
<groupId>com.robbypond</groupId>
@@ -198,5 +198,5 @@
</plugins>
</build>
<url>https://github.com/Elhuyar/MSM.git</url>
<version>1.3.5</version>
<version>1.3.6</version>
</project>
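Both dependency bumps above stay within the same minor lines, so they are drop-in upgrades; in particular, mysql-connector-java 5.1.46 keeps the legacy com.mysql.jdbc.Driver class and jdbc:mysql:// URL scheme of the 5.1.x series. The following is only a minimal, hypothetical JDBC sketch (URL, schema and credentials are placeholders; MSM's actual connection code is not part of this diff):

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;

public class MySqlDriverCheck {
    public static void main(String[] args) throws ClassNotFoundException {
        // The 5.1.x series still registers the legacy driver class (8.x later renamed it to com.mysql.cj.jdbc.Driver).
        Class.forName("com.mysql.jdbc.Driver");

        // Placeholder URL and credentials, not MSM's real settings.
        String url = "jdbc:mysql://localhost:3306/msm?useUnicode=true&characterEncoding=UTF-8";
        try (Connection conn = DriverManager.getConnection(url, "msm_user", "msm_password")) {
            System.out.println("Connected with driver " + conn.getMetaData().getDriverVersion());
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
}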
148 changes: 147 additions & 1 deletion src/main/java/elh/eus/MSM/CLI.java
@@ -24,15 +24,31 @@
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.sql.Connection;
import java.sql.SQLException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Properties;
import java.util.Set;

import javax.naming.NamingException;

import org.apache.commons.io.FileUtils;

import com.optimaize.langdetect.i18n.LdLocale;
import com.optimaize.langdetect.ngram.NgramExtractors;
import com.optimaize.langdetect.profiles.LanguageProfile;
import com.optimaize.langdetect.profiles.LanguageProfileBuilder;
import com.optimaize.langdetect.profiles.LanguageProfileWriter;
import com.optimaize.langdetect.text.CommonTextObjectFactories;
import com.optimaize.langdetect.text.TextObject;
import com.optimaize.langdetect.text.TextObjectFactory;

import net.sourceforge.argparse4j.ArgumentParsers;
import net.sourceforge.argparse4j.impl.Arguments;
import net.sourceforge.argparse4j.inf.ArgumentParser;
@@ -98,6 +114,13 @@ public class CLI {
*/
private Subparser influenceTaggerParser;


/**
* The parser that manages the language detection sub-command.
*/
private Subparser langDetectParser;


/**
* The parser that manages the twitter user info sub-command.
*/
@@ -123,6 +146,8 @@ public CLI() {
loadInfluenceTaggerParameters();
twitterUserParser = subParsers.addParser("twtUser").help("Twitter user info CLI");
loadTwitterUserInfoParameters();
langDetectParser = subParsers.addParser("langid").help("Language detection CLI");
loadLangDetectParameters();
userLocationGeocoderParser = subParsers.addParser("geocode").help("Geocoder for twitter user locations info CLI");
loadUserLocationGeocoderParameters();

@@ -220,11 +245,14 @@ else if (args[0].equals("geocode")) {
else if (args[0].equals("twtUser")) {
twtUserUInfo();
}
else if (args[0].equals("langid")) {
langDetect();
}

} catch (ArgumentParserException e) {
argParser.handleError(e);
System.out.println("Run java -jar target/MSM-" + version
+ ".jar (twitter|feed|influence|twtUser|geocode) -help for details");
+ ".jar (twitter|feed|influence|twtUser|langid|geocode) -help for details");
System.exit(1);
}
}
@@ -544,6 +572,84 @@ public final void twtUserUInfo()
}
}

public final void langDetect()
{
String strings = parsedArguments.getString("strings");
String langs = parsedArguments.getString("langs");
String type = parsedArguments.getString("type");

boolean twitterlangs = parsedArguments.getBoolean("twitterLangid");
boolean train = parsedArguments.getBoolean("train");
boolean allLangprofs = parsedArguments.getBoolean("onlySpecificLanguageProfiles");

List<String> acceptedLangs = Arrays.asList(langs.split(","));

LangDetect lid;
if (allLangprofs) {
lid = new LangDetect(acceptedLangs);
}
else {
lid = new LangDetect();
}
String input = strings;
String lang = "unk";
if (MSMUtils.checkFile(strings))
{
try {
input = FileUtils.readFileToString(new File(strings), StandardCharsets.UTF_8);
} catch (IOException e) {
System.err.println("MSM::langDetect - ERROR when reading from file.");
}
}

if (train && !input.equals(strings))
{
//create text object factory:
TextObjectFactory textObjectFactory = CommonTextObjectFactories.forIndexingCleanText();

//load your training text:
TextObject inputText = textObjectFactory.create()
.append(input);


//create the profile:
LanguageProfile languageProfile = new LanguageProfileBuilder(LdLocale.fromString(langs))
.ngramExtractor(NgramExtractors.standard())
.minimalFrequency(5) //adjust please
.addText(inputText)
.build();

//store it to disk if you like:
try {
new LanguageProfileWriter().write(languageProfile, new FileOutputStream(new File(strings+"_"+langs+"_ldprofile")));
} catch (IOException e) {
System.err.println("MSM::langDetect - ERROR when writing language profile to file "+strings+"_"+langs+"_ldprofile .");
System.exit(1);
}
}
else if (train)
{
System.err.println("MSM::langDetect - ERROR: train activated but invalid file given. Exiting now");
System.exit(1);
}

switch (type)
{
case "twitter":
lang = lid.detectTwtLanguage(input,langs);
break;
default:
lang = lid.detectFeedLanguage(input,langs);
}

System.out.println(input+"\n"+lang+" - probs: "+lid.probabilities(input)+"\n-------------------------------------------");
}

public final void loadTwitterCrawlerParameters()
{
@@ -743,6 +849,46 @@ public final void loadUserLocationGeocoderParameters()
}


private void loadLangDetectParameters()
{
langDetectParser.addArgument("-s", "--strings")
.required(true)
.help("string whose language should be detected, or a file containing text. "
+ "If a file is given, the language detection unit is the whole file.\n");
langDetectParser.addArgument("-l", "--langs")
.setDefault("eu,es,en,fr")
.help("list of accepted languages. Use ISO 639 codes separated by commas (e.g. --langs=es,eu,en,fr). Default is 'eu,es,en,fr'.\n"
+ "NOTE: when --train is active this value is used as the language name of the new profile.\n");
langDetectParser.addArgument("-tl", "--twitterLangid")
.action(Arguments.storeTrue())
.help("Whether to trust Twitter's own language identification to filter languages or not. Default is no.\n"
+ "NOTE 1: languages are defined in the config file.\n"
+ "NOTE 2: before activating this option make sure Twitter identifies all the languages you are working with, especially in the case of less-resourced languages.\n"
+ "NOTE 3: even if this option is active MSM will perform its own language identification, and leverage it with the Twitter info.\n");
langDetectParser.addArgument("-o", "--onlySpecificLanguageProfiles")
.action(Arguments.storeTrue())
.help("Do not load all language profiles, only those specified in --langs argument.\n");
langDetectParser.addArgument("-t", "--type")
.choices("twitter", "longtext")
.setDefault("twitter")
.help("the type of text to process:\n"
+ "\t - \"twitter\" : microblogging or other short messages\n"
+ "\t - \"longtext\" : paragraphs or longer texts\n");
langDetectParser.addArgument("-tr", "--train")
.action(Arguments.storeTrue())
.help("train a new model from the file given in the --strings parameter.\n"
+ "\t\tWARNING: the --langs argument value is used as the language name to store the new language profile.\n"
+ "\t\tWARNING: --type is used to generate a short or standard text profile.\n"
+ "\t\tWARNING: the profile is stored next to the input file, named after the input file, the language and the \"_ldprofile\" suffix, e.g. input_es_ldprofile\n");


}



/**
* Dummy function to get the version of this software from the pom.properties file.
* @return
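For reference, the new langid sub-command's detection path relies on the optimaize language-detector classes imported above. The wiring inside LangDetect (its languageDetector and textObjectFactory fields) is not shown in this diff, so the snippet below is only a minimal sketch of the library's documented usage with a made-up sample sentence; it mirrors the calls that detectFeedLanguage/detectTwtLanguage and probabilities() build on.

import java.util.List;

import com.optimaize.langdetect.DetectedLanguage;
import com.optimaize.langdetect.LanguageDetector;
import com.optimaize.langdetect.LanguageDetectorBuilder;
import com.optimaize.langdetect.ngram.NgramExtractors;
import com.optimaize.langdetect.profiles.LanguageProfile;
import com.optimaize.langdetect.profiles.LanguageProfileReader;
import com.optimaize.langdetect.text.CommonTextObjectFactories;
import com.optimaize.langdetect.text.TextObject;
import com.optimaize.langdetect.text.TextObjectFactory;

public class LangidSketch {
    public static void main(String[] args) throws Exception {
        // Load the built-in language profiles (the new LangDetect constructor can restrict this list to --langs).
        List<LanguageProfile> profiles = new LanguageProfileReader().readAllBuiltIn();

        // Build a detector over those profiles.
        LanguageDetector detector = LanguageDetectorBuilder.create(NgramExtractors.standard())
                .withProfiles(profiles)
                .build();

        // Turn the raw string into a TextObject before querying the detector.
        TextObjectFactory factory = CommonTextObjectFactories.forDetectingOnLargeText();
        TextObject text = factory.forText("Gaur goizean bilera izan dugu eta arratsaldean txostena idatziko dugu.");

        // Ranked per-language probabilities, the same call the CLI output is built from.
        List<DetectedLanguage> ranked = detector.getProbabilities(text);
        for (DetectedLanguage dl : ranked) {
            System.out.println(dl.getLocale().getLanguage() + " -> " + dl.getProbability());
        }
    }
}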
48 changes: 44 additions & 4 deletions src/main/java/elh/eus/MSM/LangDetect.java
@@ -19,6 +19,7 @@
package elh.eus.MSM;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

@@ -47,16 +48,28 @@ public class LangDetect {
//Pattern to trust Twitter lang identification for certain languages
private Pattern twtLangs = Pattern.compile("(en|es|fr|de|tr)");
//Pattern to normalize hashtags and user names
private Pattern userhashtag = Pattern.compile("[#@]([\\p{L}\\p{M}\\p{Nd}_]+\\b)");
private Pattern hashtag = Pattern.compile("[#]([\\p{L}\\p{M}\\p{Nd}_]+\\b)");
private Pattern user = Pattern.compile("[@]([\\p{L}\\p{M}\\p{Nd}_]+\\b)");
//Pattern to match urls in tweets. There are more efficient ways to do this but, for
//the moment this is a fast solution
private Pattern urlPattern = Pattern.compile("([fh]t?tps?://)?[a-zA-Z_0-9\\-]+(\\.\\w[a-zA-Z_0-9\\-]+)+(/[#&\\n\\-=?\\+\\%/\\.\\w]+)?");

public LangDetect()
{
this(new ArrayList<String>());
}

public LangDetect(List<String> langprofs)
{
try {
//load all languages:
languageProfiles = new LanguageProfileReader().readAllBuiltIn();
if (langprofs.isEmpty()) {
//load all languages:
languageProfiles = new LanguageProfileReader().readAllBuiltIn();
}
else {
//load only specific language profiles
languageProfiles = new LanguageProfileReader().read(langprofs);
}
}
catch (IOException ioe){
System.err.println("Utils::detectLanguage -> Error when loading language models");
@@ -162,7 +175,8 @@ public String detectTwtLanguage(String input, String supposedLang)
{
String result = "unk";
//query:
String detectStr = userhashtag.matcher(input).replaceAll(" $1");
String detectStr = hashtag.matcher(input).replaceAll(" $1");
detectStr = user.matcher(detectStr).replaceAll(" ");
detectStr = urlPattern.matcher(detectStr).replaceAll("").replaceAll("\\s+", " ");
TextObject textObject = textObjectFactory.forText(detectStr);
List<DetectedLanguage> langs = languageDetector.getProbabilities(textObject);
@@ -191,4 +205,30 @@ public String detectTwtLanguage(String input, String supposedLang)
return result;
}


/**
* Return the language probabilities detected for an input text.
*
* @param input text whose language should be estimated
* @return a string listing the detected languages and their probabilities
*/
public String probabilities(String input)
{
//query:
TextObject textObject = textObjectFactory.forText(input);
List<DetectedLanguage> langs = languageDetector.getProbabilities(textObject);
StringBuilder sb = new StringBuilder();
sb.append("{");
for (DetectedLanguage l : langs)
{
//System.err.println("Utils::detectLanguage -> lang for text "+textObject+" ("+langs.indexOf(l) +") -> "+l.toString()+" ("+l.getLocale().getLanguage()+")");
double prob = l.getProbability();
sb.append(l.getLocale()).append(":").append(prob).append("; ");
}
sb.append("}");
return sb.toString();
}

}
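To make the effect of replacing the single userhashtag pattern with separate hashtag and user patterns concrete, here is a small, self-contained sketch (the tweet text is invented) of the normalization now performed before detection: the hashtag word is kept, while the '#', @-mentions and URLs are dropped.

import java.util.regex.Pattern;

public class TweetNormalizationSketch {
    public static void main(String[] args) {
        // Same patterns as LangDetect: hashtags keep their word, user mentions are removed, URLs are stripped.
        Pattern hashtag = Pattern.compile("[#]([\\p{L}\\p{M}\\p{Nd}_]+\\b)");
        Pattern user = Pattern.compile("[@]([\\p{L}\\p{M}\\p{Nd}_]+\\b)");
        Pattern url = Pattern.compile("([fh]t?tps?://)?[a-zA-Z_0-9\\-]+(\\.\\w[a-zA-Z_0-9\\-]+)+(/[#&\\n\\-=?\\+\\%/\\.\\w]+)?");

        String tweet = "@lagunbat kaixo, bihar elkartuko gara #euskaraz http://example.com/bilera";

        String s = hashtag.matcher(tweet).replaceAll(" $1"); // '#euskaraz' -> ' euskaraz'
        s = user.matcher(s).replaceAll(" ");                 // '@lagunbat' is dropped entirely
        s = url.matcher(s).replaceAll("").replaceAll("\\s+", " ").trim(); // trim() added here only for tidy output

        System.out.println(s); // prints: kaixo, bihar elkartuko gara euskaraz
    }
}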
