Skip to content

Commit

Permalink
feat: use proper generics
Browse files Browse the repository at this point in the history
  • Loading branch information
romanchyla committed Feb 17, 2022
1 parent bb6a11e commit f552f2d
Show file tree
Hide file tree
Showing 3 changed files with 174 additions and 184 deletions.
Original file line number Diff line number Diff line change
@@ -1,191 +1,181 @@
package org.apache.solr.analysis.author;


import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* Creates variations of author names; it receives e.g.:
* John, K
* Creates variations of author names; it receives e.g.: John, K
*/

public class AuthorQueryVariations {

public static final Logger log = LoggerFactory.getLogger(AuthorQueryVariations.class);

/**
 * Builds the query-time variations (plain strings and regular-expression
 * patterns) for an author name such as "Hector, Gomez Q".
 * <p>
 * The input is split by {@code AuthorUtils.parseAuthor}. The result contains
 * everything {@code generateSynonymVariations} produces plus the extra patterns
 * that {@code generateNameVariations} adds for querying the author index only.
 * <p>
 * Intended output for "h quintero gomez" (taken from the original documentation):
 *
 * <pre>
 * GOMEZ, H\w* QUINTERO\b.*
 * GOMEZ,
 * GOMEZ, H\w*
 * GOMEZ, H\w* Q\b.* &lt;-- only this one is extra added
 * </pre>
 *
 * @param authorString
 *          author name
 * @return insertion-ordered set of variations; when
 *         {@code AuthorUtils.parseAuthor} returns {@code null} the set holds
 *         only {@code authorString} itself
 */
public static HashSet<String> getQueryVariationsInclRegex(String authorString) {
  final HashMap<String,String> nameParts = AuthorUtils.parseAuthor(authorString);
  final HashSet<String> result = new LinkedHashSet<String>();

  if (nameParts != null) {
    generateNameVariations(nameParts, result);
  } else {
    // no parse available: fall back to the raw input
    result.add(authorString);
  }
  return result;
}

/**
 * Convenience overload: generates the query-time variations of an already
 * parsed author name into a fresh insertion-ordered set.
 *
 * @param parsedAuthor
 *          name parts of the author
 * @return newly created set holding the generated variations
 */
public static HashSet<String> generateNameVariations(HashMap<String,String> parsedAuthor) {
  return generateNameVariations(parsedAuthor, new LinkedHashSet<String>());
}

/**
 * Fills {@code variations} with the base variations (via
 * {@code generateSynonymVariations}) and then appends the patterns that are
 * needed only when querying.
 * <p>
 * The extra patterns reduce the middle name to its initial, or, when the middle
 * name already is a single letter, leave it open-ended with {@code .*}.
 *
 * @param parsedAuthor
 *          name parts, read under the keys "last", "first" and "middle"
 * @param variations
 *          set that receives the variations; modified in place
 * @return the same {@code variations} instance
 */
protected static HashSet<String> generateNameVariations(
    HashMap<String,String> parsedAuthor,
    HashSet<String> variations) {

  // base variations first, so they keep their position in insertion order
  generateSynonymVariations(parsedAuthor, variations);

  final String surname = parsedAuthor.get("last");
  final String given = parsedAuthor.get("first");
  final String mid = parsedAuthor.get("middle");

  // without a first name there is nothing query-specific to add
  if (given == null) {
    return variations;
  }

  final String head = surname + ", ";
  final boolean fullGiven = given.length() > 1;

  if (mid == null) {
    if (fullGiven) {
      variations.add(head + given.substring(0,1) + "\\b.*");
    }
    return variations;
  }

  final boolean fullMiddle = mid.length() > 1;
  if (fullGiven && fullMiddle) {
    variations.add(head + given + " " + mid.substring(0,1) + "\\b.*");
  } else if (fullGiven) {
    variations.add(head + given + " " + mid + ".*");
    variations.add(head + given.substring(0,1) + " " + mid + ".*");
  } else if (fullMiddle) {
    variations.add(head + given + "\\w* " + mid.substring(0,1) + "\\b.*");
  }

  return variations;
}




/**
 * Builds the base ("synonym") variations for an author name such as
 * "Hector, Gomez Q".
 * <p>
 * The process that transforms the curated synonyms uses *only* the variations
 * generated here. The same limited set is also part of the query-time
 * variations, so DON'T ADD THINGS HERE that are needed only for the query
 * phase; those belong in {@code generateNameVariations}.
 * <p>
 * Example list from the original documentation:
 *
 * <pre>
 * HECTOR, GOMEZ
 * HECTOR, G
 * HECTOR,
 * HECTOR, GOMEZ Q*
 * HECTOR, G Q*
 * </pre>
 *
 * @param authorString
 *          name in the natural form
 * @return insertion-ordered set of variations; when
 *         {@code AuthorUtils.parseAuthor} returns {@code null} the set holds
 *         only {@code authorString} itself
 */
public static HashSet<String> getQueryVariations(String authorString) {
  final HashMap<String,String> nameParts = AuthorUtils.parseAuthor(authorString);
  final HashSet<String> result = new LinkedHashSet<String>();

  if (nameParts != null) {
    return generateSynonymVariations(nameParts, result);
  }

  // no parse available: fall back to the raw input
  result.add(authorString);
  return result;
}

/**
 * Convenience overload: generates the base ("synonym") variations of an already
 * parsed author name into a fresh insertion-ordered set.
 *
 * @param parsedAuthor
 *          name parts of the author
 * @return newly created set holding the generated variations
 */
public static HashSet<String> generateSynonymVariations(HashMap<String,String> parsedAuthor) {
  return generateSynonymVariations(parsedAuthor, new LinkedHashSet<String>());
}

/**
 * Adds the base ("synonym") variations of a parsed author name to
 * {@code variations}.
 * <p>
 * Always adds {@code "LAST,"} (or {@code "LAST,.*"} when the surname is the only
 * entry in the map). Depending on which of first/middle name are present, and on
 * whether each is a full name or a single initial, it then adds plain and regex
 * forms such as {@code "LAST, FIRST"}, {@code "LAST, F"},
 * {@code "LAST, FIRST MIDDLE\b.*"} or {@code "LAST, F\w* MIDDLE\b.*"}.
 *
 * @param parsedAuthor
 *          name parts keyed by "last", "first" and "middle"
 * @param variations
 *          set that receives the variations; modified in place
 * @return the same {@code variations} instance
 */
protected static HashSet<String> generateSynonymVariations(
    HashMap<String,String> parsedAuthor,
    HashSet<String> variations) {

  // Keys are lower-case, matching generateNameVariations and the documented
  // result of AuthorUtils.parseAuthor; capitalized lookups never matched.
  String last = parsedAuthor.get("last");
  String first = parsedAuthor.get("first");
  String middle = parsedAuthor.get("middle");

  if (parsedAuthor.size() == 1 && last != null) {
    variations.add(String.format("%s,.*", last)); // all we got was last name
  } else {
    variations.add(String.format("%s,", last));
  }

  if (first == null) {
    return variations;
  }

  String prefix = last + ", ";
  if (middle != null) {
    if (first.length() > 1) {
      variations.add(prefix + first);
      variations.add(prefix + first.substring(0,1));
      // a single-letter middle adds nothing here; generateNameVariations
      // contributes the query-only forms for that case
      if (middle.length() > 1) {
        variations.add(prefix + first + " " + middle + "\\b.*");
        variations.add(prefix + first.substring(0,1) + " " + middle.substring(0,1) + "\\b.*");
      }
    } else {
      variations.add(prefix + first + "\\w*");
      if (middle.length() > 1) {
        variations.add(prefix + first + "\\w* " + middle + "\\b.*");
      } else if (middle.length() == 1) {
        variations.add(prefix + first + "\\w* " + middle + ".*");
      }
    }
  } else {
    if (first.length() > 1) {
      variations.add(prefix + first + "\\b.*");
      variations.add(prefix + first.substring(0,1));
    } else if (first.length() == 1) {
      variations.add(prefix + first + ".*");
    }
  }

  return variations;
}
public static final Logger log = LoggerFactory.getLogger(AuthorQueryVariations.class);

/**
 * Builds the query-time variations (plain strings and regular-expression
 * patterns) for an author name such as "Hector, Gomez Q".
 * <p>
 * The input is split by {@code AuthorUtils.parseAuthor}. The result contains
 * everything {@code generateSynonymVariations} produces plus the extra patterns
 * that {@code generateNameVariations} adds for querying the author index only.
 * <p>
 * Intended output for "h quintero gomez" (taken from the original documentation):
 *
 * <pre>
 * GOMEZ, H\w* QUINTERO\b.*
 * GOMEZ,
 * GOMEZ, H\w*
 * GOMEZ, H\w* Q\b.* &lt;-- only this one is extra added
 * </pre>
 *
 * @param authorString author name
 * @return insertion-ordered set of variations; when
 *         {@code AuthorUtils.parseAuthor} returns {@code null} the set holds
 *         only {@code authorString} itself
 */
public static HashSet<String> getQueryVariationsInclRegex(String authorString) {
  final Map<String, String> nameParts = AuthorUtils.parseAuthor(authorString);
  final HashSet<String> result = new LinkedHashSet<String>();

  if (nameParts != null) {
    generateNameVariations(nameParts, result);
  } else {
    // no parse available: fall back to the raw input
    result.add(authorString);
  }
  return result;
}

/**
 * Convenience overload: generates the query-time variations of an already
 * parsed author name into a fresh insertion-ordered set.
 *
 * @param parsedAuthor name parts of the author
 * @return newly created set holding the generated variations
 */
public static Set<String> generateNameVariations(HashMap<String, String> parsedAuthor) {
  return generateNameVariations(parsedAuthor, new LinkedHashSet<String>());
}

/**
 * Fills {@code variations} with the base variations (via
 * {@code generateSynonymVariations}) and then appends the patterns that are
 * needed only when querying.
 * <p>
 * The extra patterns reduce the middle name to its initial, or, when the middle
 * name already is a single letter, leave it open-ended with {@code .*}.
 *
 * @param parsedAuthor name parts, read under the keys "last", "first" and
 *                     "middle"
 * @param variations   set that receives the variations; modified in place
 * @return the same {@code variations} instance
 */
protected static Set<String> generateNameVariations(Map<String, String> parsedAuthor, Set<String> variations) {

  // base variations first, so they keep their position in insertion order
  generateSynonymVariations(parsedAuthor, variations);

  final String surname = parsedAuthor.get("last");
  final String given = parsedAuthor.get("first");
  final String mid = parsedAuthor.get("middle");

  // without a first name there is nothing query-specific to add
  if (given == null) {
    return variations;
  }

  final String head = surname + ", ";
  final boolean fullGiven = given.length() > 1;

  if (mid == null) {
    if (fullGiven) {
      variations.add(head + given.substring(0, 1) + "\\b.*");
    }
    return variations;
  }

  final boolean fullMiddle = mid.length() > 1;
  if (fullGiven && fullMiddle) {
    variations.add(head + given + " " + mid.substring(0, 1) + "\\b.*");
  } else if (fullGiven) {
    variations.add(head + given + " " + mid + ".*");
    variations.add(head + given.substring(0, 1) + " " + mid + ".*");
  } else if (fullMiddle) {
    variations.add(head + given + "\\w* " + mid.substring(0, 1) + "\\b.*");
  }

  return variations;
}

/**
 * Builds the base ("synonym") variations for an author name such as
 * "Hector, Gomez Q".
 * <p>
 * The process that transforms the curated synonyms uses *only* the variations
 * generated here. The same limited set is also part of the query-time
 * variations, so DON'T ADD THINGS HERE that are needed only for the query
 * phase; those belong in {@code generateNameVariations}.
 * <p>
 * Example list from the original documentation:
 *
 * <pre>
 * HECTOR, GOMEZ
 * HECTOR, G
 * HECTOR,
 * HECTOR, GOMEZ Q*
 * HECTOR, G Q*
 * </pre>
 *
 * @param authorString name in the natural form
 * @return insertion-ordered set of variations; when
 *         {@code AuthorUtils.parseAuthor} returns {@code null} the set holds
 *         only {@code authorString} itself
 */
public static HashSet<String> getQueryVariations(String authorString) {
  final Map<String, String> nameParts = AuthorUtils.parseAuthor(authorString);
  final HashSet<String> result = new LinkedHashSet<String>();

  if (nameParts != null) {
    return generateSynonymVariations(nameParts, result);
  }

  // no parse available: fall back to the raw input
  result.add(authorString);
  return result;
}

/**
 * Convenience overload: generates the base ("synonym") variations of an already
 * parsed author name into a fresh insertion-ordered set.
 *
 * @param parsedAuthor name parts of the author
 * @return newly created set holding the generated variations
 */
public static HashSet<String> generateSynonymVariations(HashMap<String, String> parsedAuthor) {
  return generateSynonymVariations(parsedAuthor, new LinkedHashSet<String>());
}

/**
 * Adds the base ("synonym") variations of a parsed author name to
 * {@code variations}.
 * <p>
 * Always adds {@code "LAST,"} (or {@code "LAST,.*"} when the surname is the only
 * entry in the map). Depending on which of first/middle name are present, and on
 * whether each is a full name or a single initial, it then adds plain and regex
 * forms such as {@code "LAST, FIRST"}, {@code "LAST, F"},
 * {@code "LAST, FIRST MIDDLE\b.*"} or {@code "LAST, F\w* MIDDLE\b.*"}.
 *
 * @param parsedAuthor name parts keyed by "last", "first" and "middle"
 * @param variations   set that receives the variations; modified in place
 * @return the populated set: the same instance as {@code variations} when it is
 *         a {@link HashSet} (true for every caller in this class), otherwise an
 *         insertion-ordered copy of it
 */
protected static HashSet<String> generateSynonymVariations(Map<String, String> parsedAuthor,
    Set<String> variations) {

  // Keys are lower-case, matching generateNameVariations and the documented
  // result of AuthorUtils.parseAuthor; capitalized lookups never matched.
  String last = parsedAuthor.get("last");
  String first = parsedAuthor.get("first");
  String middle = parsedAuthor.get("middle");

  if (parsedAuthor.size() == 1 && last != null) {
    variations.add(String.format("%s,.*", last)); // all we got was last name
  } else {
    variations.add(String.format("%s,", last));
  }

  if (first != null) {
    String prefix = last + ", ";
    if (middle != null) {
      if (first.length() > 1) {
        variations.add(prefix + first);
        variations.add(prefix + first.substring(0, 1));
        // a single-letter middle adds nothing here; generateNameVariations
        // contributes the query-only forms for that case
        if (middle.length() > 1) {
          variations.add(prefix + first + " " + middle + "\\b.*");
          variations.add(prefix + first.substring(0, 1) + " " + middle.substring(0, 1) + "\\b.*");
        }
      } else {
        variations.add(prefix + first + "\\w*");
        if (middle.length() > 1) {
          variations.add(prefix + first + "\\w* " + middle + "\\b.*");
        } else if (middle.length() == 1) {
          variations.add(prefix + first + "\\w* " + middle + ".*");
        }
      }
    } else {
      if (first.length() > 1) {
        variations.add(prefix + first + "\\b.*");
        variations.add(prefix + first.substring(0, 1));
      } else if (first.length() == 1) {
        variations.add(prefix + first + ".*");
      }
    }
  }

  // The parameter is a Set but the declared return type is HashSet: hand back
  // the caller's instance when possible so existing callers keep working.
  if (variations instanceof HashSet) {
    return (HashSet<String>) variations;
  }
  return new LinkedHashSet<String>(variations);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ public static String normalizeAuthor(String a) {
/**
* Method used by the tokenizer chain to normalize author names.
*
* @param a
* @param keepApostrophe
* @param a - author name
* @param keepApostrophe - bool, preserve apostrophe
* @return Normalized string
*/
public static String normalizeAuthor(String a, boolean keepApostrophe) {
Expand Down Expand Up @@ -82,7 +82,7 @@ public static String normalizeAuthor(String a, boolean keepApostrophe) {
/**
* Utility method to split string (author name) into constituting parts
*
* @param a
* @param a - author name
* @return map with 'last', 'first', 'middle' keys
*/
public static Map<String,String> parseAuthor(String a) {
Expand All @@ -102,8 +102,8 @@ public static Map<String,String> parseAuthor(String a, boolean normalize) {
* Utility method employed by AuthorTransliterationTokenizer and also by other components
* inside the parser chain to discover other potential readings of the author's name.
*
* @param a
* @return
* @param a - author name as string
* @return - list of author name variants in ascii form
*/
public static ArrayList<String> getAsciiTransliteratedVariants(String a) {
HashSet<String> synonyms = new HashSet<String>();
Expand Down
Loading

0 comments on commit f552f2d

Please sign in to comment.