
Commit

added language detection interface, and updated language detection implementation. new version 1.3.6
isanvicente committed Jul 12, 2018
1 parent 8c7bedf commit d388f05
Showing 3 changed files with 194 additions and 8 deletions.
6 changes: 3 additions & 3 deletions pom.xml
@@ -31,7 +31,7 @@
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.11</version>
<version>4.12</version>
<scope>test</scope>
</dependency>
<dependency>
@@ -103,7 +103,7 @@
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.36</version>
<version>5.1.46</version>
</dependency>
<dependency>
<groupId>com.robbypond</groupId>
@@ -198,5 +198,5 @@
</plugins>
</build>
<url>https://github.com/Elhuyar/MSM.git</url>
<version>1.3.5</version>
<version>1.3.6</version>
</project>
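Both dependency bumps above stay within the same minor lines, so they are drop-in upgrades; in particular, mysql-connector-java 5.1.46 keeps the legacy com.mysql.jdbc.Driver class and jdbc:mysql:// URL scheme of the 5.1.x series. The following is only a minimal, hypothetical JDBC sketch (URL, schema and credentials are placeholders; MSM's actual connection code is not part of this diff):

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;

public class MySqlDriverCheck {
    public static void main(String[] args) throws ClassNotFoundException {
        // The 5.1.x series still registers the legacy driver class (8.x later renamed it to com.mysql.cj.jdbc.Driver).
        Class.forName("com.mysql.jdbc.Driver");

        // Placeholder URL and credentials, not MSM's real settings.
        String url = "jdbc:mysql://localhost:3306/msm?useUnicode=true&characterEncoding=UTF-8";
        try (Connection conn = DriverManager.getConnection(url, "msm_user", "msm_password")) {
            System.out.println("Connected with driver " + conn.getMetaData().getDriverVersion());
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
}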
148 changes: 147 additions & 1 deletion src/main/java/elh/eus/MSM/CLI.java
@@ -24,15 +24,31 @@
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.sql.Connection;
import java.sql.SQLException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Properties;
import java.util.Set;

import javax.naming.NamingException;

import org.apache.commons.io.FileUtils;

import com.optimaize.langdetect.i18n.LdLocale;
import com.optimaize.langdetect.ngram.NgramExtractors;
import com.optimaize.langdetect.profiles.LanguageProfile;
import com.optimaize.langdetect.profiles.LanguageProfileBuilder;
import com.optimaize.langdetect.profiles.LanguageProfileWriter;
import com.optimaize.langdetect.text.CommonTextObjectFactories;
import com.optimaize.langdetect.text.TextObject;
import com.optimaize.langdetect.text.TextObjectFactory;

import net.sourceforge.argparse4j.ArgumentParsers;
import net.sourceforge.argparse4j.impl.Arguments;
import net.sourceforge.argparse4j.inf.ArgumentParser;
@@ -98,6 +114,13 @@ public class CLI {
*/
private Subparser influenceTaggerParser;


/**
* The parser that manages the language detection sub-command.
*/
private Subparser langDetectParser;


/**
* The parser that manages the twitter user info sub-command.
*/
@@ -123,6 +146,8 @@ public CLI() {
loadInfluenceTaggerParameters();
twitterUserParser = subParsers.addParser("twtUser").help("Twitter user info CLI");
loadTwitterUserInfoParameters();
langDetectParser = subParsers.addParser("langid").help("Language detection CLI");
loadLangDetectParameters();
userLocationGeocoderParser = subParsers.addParser("geocode").help("Geocoder for twitter user locations info CLI");
loadUserLocationGeocoderParameters();

@@ -220,11 +245,14 @@ else if (args[0].equals("geocode")) {
else if (args[0].equals("twtUser")) {
twtUserUInfo();
}
else if (args[0].equals("langid")) {
langDetect();
}

} catch (ArgumentParserException e) {
argParser.handleError(e);
System.out.println("Run java -jar target/MSM-" + version
+ ".jar (twitter|feed|influence|twtUser|geocode) -help for details");
+ ".jar (twitter|feed|influence|twtUser|langid|geocode) -help for details");
System.exit(1);
}
}
@@ -544,6 +572,84 @@ public final void twtUserUInfo()
}
}

public final void langDetect()
{
String strings = parsedArguments.getString("strings");
String langs = parsedArguments.getString("langs");
String type = parsedArguments.getString("type");

boolean twitterlangs = parsedArguments.getBoolean("twitterLangid");
boolean train = parsedArguments.getBoolean("train");
boolean allLangprofs = parsedArguments.getBoolean("onlySpecificLanguageProfiles");

List<String> acceptedLangs = Arrays.asList(langs.split(","));

LangDetect lid;
if (allLangprofs) {
lid = new LangDetect(acceptedLangs);
}
else {
lid = new LangDetect();
}
String input = strings;
String lang = "unk";
if (MSMUtils.checkFile(strings))
{
try {
input = FileUtils.readFileToString(new File(strings), StandardCharsets.UTF_8);
} catch (IOException e) {
System.err.println("MSM::langDetect - ERROR when reading from file.");
}
}

if (train && !input.equals(strings))
{
//create text object factory:
TextObjectFactory textObjectFactory = CommonTextObjectFactories.forIndexingCleanText();

//load your training text:
TextObject inputText = textObjectFactory.create()
.append(input);


//create the profile:
LanguageProfile languageProfile = new LanguageProfileBuilder(LdLocale.fromString(langs))
.ngramExtractor(NgramExtractors.standard())
.minimalFrequency(5) //adjust please
.addText(inputText)
.build();

//store it to disk if you like:
try {
new LanguageProfileWriter().write(languageProfile, new FileOutputStream(new File(strings+"_"+langs+"_ldprofile")));
} catch (IOException e) {
System.err.println("MSM::langDetect - ERROR when writing language profile to file "+strings+"_"+langs+"_ldprofile .");
System.exit(1);
}
}
else if (train)
{
System.err.println("MSM::langDetect - ERROR: train activated but invalid file given. Exiting now");
System.exit(1);
}

switch (type)
{
case "twitter":
lang = lid.detectTwtLanguage(input,langs);
break;
default:
lang = lid.detectFeedLanguage(input,langs);
}

System.out.println(input+"\n"+lang+" - probs: "+lid.probabilities(input)+"\n-------------------------------------------");
}

public final void loadTwitterCrawlerParameters()
{
@@ -743,6 +849,46 @@ public final void loadUserLocationGeocoderParameters()
}


private void loadLangDetectParameters()
{
langDetectParser.addArgument("-s", "--strings")
.required(true)
.help("string whose language should be detected, or a file containing text. "
+ "If a file is given, the language detection unit is the whole file.\n");
langDetectParser.addArgument("-l", "--langs")
.setDefault("eu,es,en,fr")
.help("list of accepted languages. Use ISO 639 codes separated by commas (e.g. --langs=es,eu,en,fr). Default is 'eu,es,en,fr'.\n"
+ "NOTE: when --train is active this value is used as the language name of the new profile.\n");
langDetectParser.addArgument("-tl", "--twitterLangid")
.action(Arguments.storeTrue())
.help("Whether to trust Twitter's own language identification to filter languages or not. Default is no.\n"
+ "NOTE 1: languages are defined in the config file.\n"
+ "NOTE 2: before activating this option make sure Twitter identifies all the languages you are working with, especially in the case of less-resourced languages.\n"
+ "NOTE 3: even if this option is active MSM will perform its own language identification, and leverage it with the Twitter info.\n");
langDetectParser.addArgument("-o", "--onlySpecificLanguageProfiles")
.action(Arguments.storeTrue())
.help("Do not load all language profiles, only those specified in --langs argument.\n");
langDetectParser.addArgument("-t", "--type")
.choices("twitter", "longtext")
.setDefault("twitter")
.help("the type of text to process:\n"
+ "\t - \"twitter\" : microblogging or other short messages\n"
+ "\t - \"longtext\" : paragraphs or longer texts\n");
langDetectParser.addArgument("-tr", "--train")
.action(Arguments.storeTrue())
.help("train a new model from the file given in the --strings parameter.\n"
+ "\t\tWARNING: the --langs argument value is used as the language name to store the new language profile.\n"
+ "\t\tWARNING: --type is used to generate a short or standard text profile.\n"
+ "\t\tWARNING: the profile is stored next to the input file, named after the input file, the language and the \"_ldprofile\" suffix, e.g. input_es_ldprofile\n");


}



/**
* Dummy function to get the version of this software from the pom.properties file.
* @return
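For reference, the new langid sub-command's detection path relies on the optimaize language-detector classes imported above. The wiring inside LangDetect (its languageDetector and textObjectFactory fields) is not shown in this diff, so the snippet below is only a minimal sketch of the library's documented usage with a made-up sample sentence; it mirrors the calls that detectFeedLanguage/detectTwtLanguage and probabilities() build on.

import java.util.List;

import com.optimaize.langdetect.DetectedLanguage;
import com.optimaize.langdetect.LanguageDetector;
import com.optimaize.langdetect.LanguageDetectorBuilder;
import com.optimaize.langdetect.ngram.NgramExtractors;
import com.optimaize.langdetect.profiles.LanguageProfile;
import com.optimaize.langdetect.profiles.LanguageProfileReader;
import com.optimaize.langdetect.text.CommonTextObjectFactories;
import com.optimaize.langdetect.text.TextObject;
import com.optimaize.langdetect.text.TextObjectFactory;

public class LangidSketch {
    public static void main(String[] args) throws Exception {
        // Load the built-in language profiles (the new LangDetect constructor can restrict this list to --langs).
        List<LanguageProfile> profiles = new LanguageProfileReader().readAllBuiltIn();

        // Build a detector over those profiles.
        LanguageDetector detector = LanguageDetectorBuilder.create(NgramExtractors.standard())
                .withProfiles(profiles)
                .build();

        // Turn the raw string into a TextObject before querying the detector.
        TextObjectFactory factory = CommonTextObjectFactories.forDetectingOnLargeText();
        TextObject text = factory.forText("Gaur goizean bilera izan dugu eta arratsaldean txostena idatziko dugu.");

        // Ranked per-language probabilities, the same call the CLI output is built from.
        List<DetectedLanguage> ranked = detector.getProbabilities(text);
        for (DetectedLanguage dl : ranked) {
            System.out.println(dl.getLocale().getLanguage() + " -> " + dl.getProbability());
        }
    }
}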
48 changes: 44 additions & 4 deletions src/main/java/elh/eus/MSM/LangDetect.java
@@ -19,6 +19,7 @@
package elh.eus.MSM;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

@@ -47,16 +48,28 @@ public class LangDetect {
//Pattern to trust Twitter lang identification for certain languages
private Pattern twtLangs = Pattern.compile("(en|es|fr|de|tr)");
//Pattern to normalize hashtags and user names
private Pattern userhashtag = Pattern.compile("[#@]([\\p{L}\\p{M}\\p{Nd}_]+\\b)");
private Pattern hashtag = Pattern.compile("[#]([\\p{L}\\p{M}\\p{Nd}_]+\\b)");
private Pattern user = Pattern.compile("[@]([\\p{L}\\p{M}\\p{Nd}_]+\\b)");
//Pattern to match urls in tweets. There are more efficient ways to do this but, for
//the moment this is a fast solution
private Pattern urlPattern = Pattern.compile("([fh]t?tps?://)?[a-zA-Z_0-9\\-]+(\\.\\w[a-zA-Z_0-9\\-]+)+(/[#&\\n\\-=?\\+\\%/\\.\\w]+)?");

public LangDetect()
{
this(new ArrayList<String>());
}

public LangDetect(List<String> langprofs)
{
try {
//load all languages:
languageProfiles = new LanguageProfileReader().readAllBuiltIn();
if (langprofs.isEmpty()) {
//load all languages:
languageProfiles = new LanguageProfileReader().readAllBuiltIn();
}
else {
//load only specific language profiles
languageProfiles = new LanguageProfileReader().read(langprofs);
}
}
catch (IOException ioe){
System.err.println("Utils::detectLanguage -> Error when loading language models");
@@ -162,7 +175,8 @@ public String detectTwtLanguage(String input, String supposedLang)
{
String result = "unk";
//query:
String detectStr = userhashtag.matcher(input).replaceAll(" $1");
String detectStr = hashtag.matcher(input).replaceAll(" $1");
detectStr = user.matcher(detectStr).replaceAll(" ");
detectStr = urlPattern.matcher(detectStr).replaceAll("").replaceAll("\\s+", " ");
TextObject textObject = textObjectFactory.forText(detectStr);
List<DetectedLanguage> langs = languageDetector.getProbabilities(textObject);
@@ -191,4 +205,30 @@ public String detectTwtLanguage(String input, String supposedLang)
return result;
}


/**
* Return the language probabilities detected for an input text.
*
* @param input text whose language should be estimated
* @return a string listing the detected languages and their probabilities
*/
public String probabilities(String input)
{
//query:
TextObject textObject = textObjectFactory.forText(input);
List<DetectedLanguage> langs = languageDetector.getProbabilities(textObject);
StringBuilder sb = new StringBuilder();
sb.append("{");
for (DetectedLanguage l : langs)
{
//System.err.println("Utils::detectLanguage -> lang for text "+textObject+" ("+langs.indexOf(l) +") -> "+l.toString()+" ("+l.getLocale().getLanguage()+")");
double prob = l.getProbability();
sb.append(l.getLocale()).append(":").append(prob).append("; ");
}
sb.append("}");
return sb.toString();
}

}
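To make the effect of replacing the single userhashtag pattern with separate hashtag and user patterns concrete, here is a small, self-contained sketch (the tweet text is invented) of the normalization now performed before detection: the hashtag word is kept, while the '#', @-mentions and URLs are dropped.

import java.util.regex.Pattern;

public class TweetNormalizationSketch {
    public static void main(String[] args) {
        // Same patterns as LangDetect: hashtags keep their word, user mentions are removed, URLs are stripped.
        Pattern hashtag = Pattern.compile("[#]([\\p{L}\\p{M}\\p{Nd}_]+\\b)");
        Pattern user = Pattern.compile("[@]([\\p{L}\\p{M}\\p{Nd}_]+\\b)");
        Pattern url = Pattern.compile("([fh]t?tps?://)?[a-zA-Z_0-9\\-]+(\\.\\w[a-zA-Z_0-9\\-]+)+(/[#&\\n\\-=?\\+\\%/\\.\\w]+)?");

        String tweet = "@lagunbat kaixo, bihar elkartuko gara #euskaraz http://example.com/bilera";

        String s = hashtag.matcher(tweet).replaceAll(" $1"); // '#euskaraz' -> ' euskaraz'
        s = user.matcher(s).replaceAll(" ");                 // '@lagunbat' is dropped entirely
        s = url.matcher(s).replaceAll("").replaceAll("\\s+", " ").trim(); // trim() added here only for tidy output

        System.out.println(s); // prints: kaixo, bihar elkartuko gara euskaraz
    }
}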
