From 33dcb9a41a98b246fefdfc04aeb44725ff7d5838 Mon Sep 17 00:00:00 2001 From: Roman Chyla Date: Thu, 3 Feb 2022 16:29:24 -0500 Subject: [PATCH] feat: removed all from unfielded search and also from copyField in schema.xml --- .../author/AuthorQueryVariations.java | 6 +- .../solr/analysis/author/AuthorUtils.java | 411 ++++++++---------- .../solr/analysis/author/NameParser.java | 291 +------------ .../server/solr/collection1/conf/schema.xml | 28 +- .../solr/collection1/conf/solrconfig.xml | 34 +- 5 files changed, 216 insertions(+), 554 deletions(-) diff --git a/contrib/adsabs/src/java/org/apache/solr/analysis/author/AuthorQueryVariations.java b/contrib/adsabs/src/java/org/apache/solr/analysis/author/AuthorQueryVariations.java index 37f89c470..9fa001d11 100644 --- a/contrib/adsabs/src/java/org/apache/solr/analysis/author/AuthorQueryVariations.java +++ b/contrib/adsabs/src/java/org/apache/solr/analysis/author/AuthorQueryVariations.java @@ -146,9 +146,9 @@ protected static HashSet generateSynonymVariations( HashMap parsedAuthor, HashSet variations) { - String last = parsedAuthor.get("last"); - String first = parsedAuthor.get("first"); - String middle = parsedAuthor.get("middle"); + String last = parsedAuthor.get("Last"); + String first = parsedAuthor.get("First"); + String middle = parsedAuthor.get("Middle"); if (parsedAuthor.size() == 1 && last != null) { variations.add(String.format("%s,.*", last)); // all we got was last name diff --git a/contrib/adsabs/src/java/org/apache/solr/analysis/author/AuthorUtils.java b/contrib/adsabs/src/java/org/apache/solr/analysis/author/AuthorUtils.java index f9189679c..7a850c53d 100644 --- a/contrib/adsabs/src/java/org/apache/solr/analysis/author/AuthorUtils.java +++ b/contrib/adsabs/src/java/org/apache/solr/analysis/author/AuthorUtils.java @@ -5,26 +5,27 @@ import java.util.HashSet; import java.util.Iterator; import java.util.List; +import java.util.Map; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.jython.JythonObjectFactory; +import org.jython.monty.interfaces.JythonNameParser; + import java.text.*; import static net.gcardone.junidecode.Junidecode.unidecode; -/** - * OK; i'll admit it - this is extremely ugly piece of code which i inherited; - * I tried to rewrite it once, but it was so convoluted that I resigned. - * - * - */ public class AuthorUtils { - + + + static final NameParser nameParser = new NameParser(); + public static final String AUTHOR_QUERY_VARIANT = "AUTHOR_QUERY_VARIANT"; public static final String AUTHOR_INPUT = "AUTHOR_INPUT"; public static final String AUTHOR_TRANSLITERATED = "AUTHOR_TRANSLITERATED"; public static final String AUTHOR_CURATED_SYN = "AUTHOR_CURATED_SYN"; - + // to remove commas from behind initials B. => B static Pattern n0 = Pattern.compile("(?<=\\b\\p{L})\\.(?=\\s*\\b)"); // these are the characters we allow for author names @@ -32,158 +33,94 @@ public class AuthorUtils { // and some special characters // original, which may miss diacritics: "(?<=\\b\\p{L})\\.(?=\\s*\\b)" \P{M}\p{M}*+ // [^,\\-\\w\\s\\{N}\\p{L}\\p{M}*+] - - + + static Pattern n1 = Pattern.compile("[^,\\-\\s\\p{N}\\p{L}\\p{M}]"); static Pattern n1b = Pattern.compile("[^,\\-\\s\\'\\p{N}\\p{L}\\p{M}]"); - + // to normalize spaces static Pattern n2 = Pattern.compile("\\s+"); // to normalize non escaped commas static Pattern n3 = Pattern.compile("(? -1 || a.indexOf('?') > -1; // \*\? should never be encountered here - if (!keepApostrophe) - a = n4.matcher(a).replaceAll("-"); - a = n0.matcher(a).replaceAll(" "); - if (keepApostrophe) - a = n1b.matcher(a).replaceAll(""); - else - a = n1.matcher(a).replaceAll(""); - a = n3.matcher(a).replaceAll(", "); - a = n2.matcher(a.trim()).replaceAll(" "); - - - if (!hasWildcards && !(a.contains(","))) // || a.contains(" ") - a = a + ","; - // do this at the end, we want to see the space instead of '-' - a = a.replace('-', ' '); - // normalize spaces once again - a = n2.matcher(a.trim()).replaceAll(" "); - return a; - } - - /** - * this whole thing become obsolete when we included the python - * name parser library (that does MUCH better job in parsing names) - * - * TODO: kill AuthorUtils.parseAuthor - */ - public static HashMap parseAuthor(String a) { + if (!keepApostrophe) + a = n4.matcher(a).replaceAll("-"); + a = n0.matcher(a).replaceAll(" "); + if (keepApostrophe) + a = n1b.matcher(a).replaceAll(""); + else + a = n1.matcher(a).replaceAll(""); + a = n3.matcher(a).replaceAll(", "); + a = n2.matcher(a.trim()).replaceAll(" "); + + + if (!hasWildcards && !(a.contains(","))) // || a.contains(" ") + a = a + ","; + // do this at the end, we want to see the space instead of '-' + a = a.replace('-', ' '); + // normalize spaces once again + a = n2.matcher(a.trim()).replaceAll(" "); + return a; + } + + + public static Map parseAuthor(String a) { return parseAuthor(a, true); } - - public static HashMap parseAuthor(String a, boolean normalize) { - HashMap parsed = new HashMap(); - if (a == null || a.length() == 0) { - return parsed; - } + + public static Map parseAuthor(String a, boolean normalize) { if (normalize) { - a = AuthorUtils.normalizeAuthor(a); - } - NameParser np = new NameParser(); - String[] p; - try { - p = np.parseName(a); - } catch (Exception e) { - throw new RuntimeException(e); + return nameParser.parseName(AuthorUtils.normalizeAuthor(a)); } - String[] keys = {"title", "first", "middle", "last", "suffix"}; - for (int i = 0; i < keys.length; i++) { - if (p[i] != null) { - parsed.put(keys[i], p[i]); - } + else { + return nameParser.parseName(a); } - return parsed; } - + public static ArrayList getAsciiTransliteratedVariants(String a) { HashSet synonyms = new HashSet(); //a = a.toUpperCase(); - + // include original synonyms.add(a); - + // downgrade to ascii String downgraded = foldToAscii(a); synonyms.add(downgraded); - - // transliterate accents - String transAcc = transliterateAccents(a); - synonyms.add(transAcc); - + + // handle russian name stuff HashSet transRus = transliterateRussianNames(synonyms); synonyms.addAll(transRus); - + // apostrophes are now preserved in the index // so we need to generate translits for those if (a.contains("'")) - synonyms.add(a.replace("'", "")); - + synonyms.add(a.replace("'", "")); + // remove the original input from the set synonyms.remove(a); - + return new ArrayList(synonyms); } - + public static String foldToAscii(String a) { - String b = unidecode(a.trim()); - if (b.contains(" ,")) - b = b.replace(" ,", ","); - return b; + String b = unidecode(a.trim()); + if (b.contains(" ,")) + b = b.replace(" ,", ","); + return b; } - - static String transliterateAccents(String a) { - String decomposed = Normalizer.normalize(a, Normalizer.Form.NFD); - char[] in = decomposed.toCharArray(); - char[] out = new char[in.length * 4]; - int outPos = 0; - for (int i = 0; i < in.length; i++) { - final char c = in[i]; - // prev will be the 1st part of the decomp char - char prev = (i > 0) ? in[i - 1] : '\0'; - char replacement; - if (c < '\u0080') { - out[outPos++] = c; - continue; - } - switch (c) { - case '\u0141': - replacement = 'L'; - break; - case '\u0308': - replacement = 'E'; - break; - case '\u030a': - replacement = 'A'; - break; - case '\u0301': - replacement = 'E'; - break; - case '\u030c': - replacement = 'H'; - break; - default: - prev = '\0'; - replacement = c; - } - if (prev != '\0' && !Character.isUpperCase(prev)) { - replacement = Character.toLowerCase(replacement); - } - out[outPos++] = replacement; - } - return String.copyValueOf(out).trim(); - } - + + /* * Splits name into parts (separated by comma and then by space) * The comma is retained; spaces between parts of names are removed @@ -196,7 +133,7 @@ public static String[] splitName(String name) { String[] nameParts = name.substring(comma+1).trim().split(" "); if (nameParts[0].equals("")) return new String[]{name.substring(0, comma).trim() + ","}; - + String[] out = new String[nameParts.length+1]; out[0] = name.substring(0, comma).trim() + ","; int i = 1; @@ -211,10 +148,10 @@ public static String[] splitName(String name) { return name.split(" "); } } - - - // XXX: this doesn't look right to me, the fifth step gets (possibly) + + + // XXX: this doesn't look right to me, the fifth step gets (possibly) // 5 times more items than the first step public static HashSet transliterateRussianNames(Set in) { HashSet synonyms = new HashSet(); @@ -232,181 +169,181 @@ public static HashSet transliterateRussianNames(Set in) { } return synonyms; } - - /* - * take care of russian apostrophes: - * 'E => E == IE == YE - * note that we do not index 'E since the search - * engine simply strips all apostrophes - */ + + /* + * take care of russian apostrophes: + * 'E => E == IE == YE + * note that we do not index 'E since the search + * engine simply strips all apostrophes + */ static Pattern p0 = Pattern.compile("(?<=\\w{2})'(?=[Ee])"); static HashSet translitRussianApostrophes(Iterator itr) { HashSet syn = new HashSet(); - + String x; while (itr.hasNext()) { - x = itr.next(); + x = itr.next(); Matcher m = p0.matcher(x); if (m.find()) { - if (x.charAt(m.end()) == 'E') { - syn.add(m.replaceAll("I")); - syn.add(m.replaceAll("Y")); - syn.add(m.replaceAll("")); - } - else { - syn.add(m.replaceAll("i")); - syn.add(m.replaceAll("y")); - syn.add(m.replaceAll("")); - } + if (x.charAt(m.end()) == 'E') { + syn.add(m.replaceAll("I")); + syn.add(m.replaceAll("Y")); + syn.add(m.replaceAll("")); + } + else { + syn.add(m.replaceAll("i")); + syn.add(m.replaceAll("y")); + syn.add(m.replaceAll("")); + } } } //log.debug("apostrophes: " + syn); return syn; } - - /* russian last names I: - * [^IJY]EV$ => IEV$ == YEV$ == JEV$ - * [^IJY]EVA$ => IEVA$ == YEVA$ == JEVA$ - */ + + /* russian last names I: + * [^IJY]EV$ => IEV$ == YEV$ == JEV$ + * [^IJY]EVA$ => IEVA$ == YEVA$ == JEVA$ + */ static Pattern p1 = Pattern.compile("(? translitRussianLastNames1(Iterator itr) { HashSet syn = new HashSet(); String x; while (itr.hasNext()) { - x = itr.next(); + x = itr.next(); Matcher m = p1.matcher(x); if (m.find()) { - if (x.charAt(m.start()) == 'E') { - syn.add(m.replaceAll("IEV")); - syn.add(m.replaceAll("YEV")); - syn.add(m.replaceAll("JEV")); - } - else { - syn.add(m.replaceAll("iev")); - syn.add(m.replaceAll("yev")); - syn.add(m.replaceAll("jev")); - } - + if (x.charAt(m.start()) == 'E') { + syn.add(m.replaceAll("IEV")); + syn.add(m.replaceAll("YEV")); + syn.add(m.replaceAll("JEV")); + } + else { + syn.add(m.replaceAll("iev")); + syn.add(m.replaceAll("yev")); + syn.add(m.replaceAll("jev")); + } + } } //log.debug("last names I: " + syn); return syn; - } - - /* russian last names II: - * ([NRBO])IA$ == $1IIA$ == $1IYA$ - */ + } + + /* russian last names II: + * ([NRBO])IA$ == $1IIA$ == $1IYA$ + */ static Pattern p2 = Pattern.compile("(?<=[NRBOnrbo])[Ii](?=[Aa],)"); static HashSet translitRussianLastNames2(Iterator itr) { HashSet syn = new HashSet(); String x; while (itr.hasNext()) { - x = itr.next(); + x = itr.next(); Matcher m = p2.matcher(x); if (m.find()) { - if (x.charAt(m.start()) == 'I') { - syn.add(m.replaceAll("II")); - syn.add(m.replaceAll("IY")); - } - else { - syn.add(m.replaceAll("ii")); - syn.add(m.replaceAll("iy")); - } + if (x.charAt(m.start()) == 'I') { + syn.add(m.replaceAll("II")); + syn.add(m.replaceAll("IY")); + } + else { + syn.add(m.replaceAll("ii")); + syn.add(m.replaceAll("iy")); + } } } //log.debug("last names II: " + syn); return syn; } - /* russian last names III: - * ([DHKLMNPSZ])IAN$ == $1YAN$ == $1JAN$ - */ + /* russian last names III: + * ([DHKLMNPSZ])IAN$ == $1YAN$ == $1JAN$ + */ static Pattern p3 = Pattern.compile("(?<=[DHKLMNPSZdhklmnpsz])[IJYijy](?=[Aa][Nn],)"); static HashSet translitRussianLastNames3(Iterator itr) { HashSet syn = new HashSet(); String x; while (itr.hasNext()) { - x = itr.next(); + x = itr.next(); Matcher m = p3.matcher(x); if (m.find()) { - if (x.charAt(m.start()) == 'I' || x.charAt(m.start()) == 'J' || x.charAt(m.start()) == 'Y') { - syn.add(m.replaceAll("I")); - syn.add(m.replaceAll("J")); - syn.add(m.replaceAll("Y")); - } - else { - syn.add(m.replaceAll("i")); - syn.add(m.replaceAll("j")); - syn.add(m.replaceAll("y")); - } + if (x.charAt(m.start()) == 'I' || x.charAt(m.start()) == 'J' || x.charAt(m.start()) == 'Y') { + syn.add(m.replaceAll("I")); + syn.add(m.replaceAll("J")); + syn.add(m.replaceAll("Y")); + } + else { + syn.add(m.replaceAll("i")); + syn.add(m.replaceAll("j")); + syn.add(m.replaceAll("y")); + } } } //log.debug("last names III: " + syn); return syn; } - - /* russian last names IV: - * AIA$ == AYA$ == AJA$ - */ + + /* russian last names IV: + * AIA$ == AYA$ == AJA$ + */ static Pattern p4 = Pattern.compile("(?<=[KNVknv][Aa])[IJYijy](?=[Aa],)"); static HashSet translitRussianLastNames4(Iterator itr) { HashSet syn = new HashSet(); String x; while (itr.hasNext()) { - x = itr.next(); + x = itr.next(); Matcher m = p4.matcher(x); if (m.find()) { - if (x.charAt(m.start()) == 'I' || x.charAt(m.start()) == 'J' || x.charAt(m.start()) == 'Y') { - syn.add(m.replaceAll("I")); - syn.add(m.replaceAll("J")); - syn.add(m.replaceAll("Y")); - } - else { - syn.add(m.replaceAll("i")); - syn.add(m.replaceAll("j")); - syn.add(m.replaceAll("y")); - } + if (x.charAt(m.start()) == 'I' || x.charAt(m.start()) == 'J' || x.charAt(m.start()) == 'Y') { + syn.add(m.replaceAll("I")); + syn.add(m.replaceAll("J")); + syn.add(m.replaceAll("Y")); + } + else { + syn.add(m.replaceAll("i")); + syn.add(m.replaceAll("j")); + syn.add(m.replaceAll("y")); + } } } //log.debug("last names IV: " + syn); return syn; } - - /* russian last names V: - * KI$ == KII$ == KIJ$ == KIY$ = KYI$ - * VI$ == VII$ == VIJ$ == VIY$ = VYI$ - * first transform [KVH]I into [KVH]II - */ + + /* russian last names V: + * KI$ == KII$ == KIJ$ == KIY$ = KYI$ + * VI$ == VII$ == VIJ$ == VIY$ = VYI$ + * first transform [KVH]I into [KVH]II + */ static Pattern p5 = Pattern.compile("(?<=[KVkv])[Ii](?=,)"); static HashSet translitRussianLastNames5(Iterator itr) { HashSet syn = new HashSet(); String x; while (itr.hasNext()) { - x = itr.next(); + x = itr.next(); Matcher m = p5.matcher(x); if (m.find()) { - if (x.charAt(m.start()) == 'I') { - syn.add(m.replaceAll("I")); - syn.add(m.replaceAll("Y")); - syn.add(m.replaceAll("YI")); - syn.add(m.replaceAll("IY")); - syn.add(m.replaceAll("IJ")); - syn.add(m.replaceAll("II")); - } - else { - syn.add(m.replaceAll("i")); - syn.add(m.replaceAll("y")); - syn.add(m.replaceAll("yi")); - syn.add(m.replaceAll("iy")); - syn.add(m.replaceAll("ij")); - syn.add(m.replaceAll("ii")); - } + if (x.charAt(m.start()) == 'I') { + syn.add(m.replaceAll("I")); + syn.add(m.replaceAll("Y")); + syn.add(m.replaceAll("YI")); + syn.add(m.replaceAll("IY")); + syn.add(m.replaceAll("IJ")); + syn.add(m.replaceAll("II")); + } + else { + syn.add(m.replaceAll("i")); + syn.add(m.replaceAll("y")); + syn.add(m.replaceAll("yi")); + syn.add(m.replaceAll("iy")); + syn.add(m.replaceAll("ij")); + syn.add(m.replaceAll("ii")); + } } } //log.debug("last names V: " + syn); return syn; } - + /* russian first names * ^IU == ^YU * ^IA == ^YA @@ -416,21 +353,21 @@ static HashSet translitRussianFirstNames(Iterator itr) { HashSet syn = new HashSet(); String x; while (itr.hasNext()) { - x = itr.next(); + x = itr.next(); Matcher m = p6.matcher(x); if (m.find()) { - if (x.charAt(m.start()) == 'I' || x.charAt(m.start()) == 'Y') { - syn.add(m.replaceAll("I")); - syn.add(m.replaceAll("Y")); - } - else { - syn.add(m.replaceAll("i")); - syn.add(m.replaceAll("y")); - } + if (x.charAt(m.start()) == 'I' || x.charAt(m.start()) == 'Y') { + syn.add(m.replaceAll("I")); + syn.add(m.replaceAll("Y")); + } + else { + syn.add(m.replaceAll("i")); + syn.add(m.replaceAll("y")); + } } } //log.debug("first names: " + syn); return syn; } - + } diff --git a/contrib/adsabs/src/java/org/apache/solr/analysis/author/NameParser.java b/contrib/adsabs/src/java/org/apache/solr/analysis/author/NameParser.java index 1b280ab2a..1864dbe10 100644 --- a/contrib/adsabs/src/java/org/apache/solr/analysis/author/NameParser.java +++ b/contrib/adsabs/src/java/org/apache/solr/analysis/author/NameParser.java @@ -1,292 +1,25 @@ package org.apache.solr.analysis.author; -//Thanks to Robert Cooper for this! -//package com.totsp.bookworm.util; +import java.util.Map; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashSet; -import java.util.Set; +import org.jython.JythonObjectFactory; +import org.jython.monty.interfaces.JythonNameParser; -import org.apache.commons.lang.StringUtils; -/** -* -* @author kebernet -*/ public class NameParser { - - private static final Set TITLES = new HashSet(); - private static final Set SUFFIXES = new HashSet(); - private static final Set COMPOUND_NAMES = new HashSet(); - public static final int TITLE = 0; - public static final int FIRST_NAME = 1; - public static final int MIDDLE_NAME = 2; - public static final int LAST_NAME = 3; - public static final int SUFFIX = 4; - - static { - for (String title : new String[] { "dr.", "dr", "doctor", "mr.", "mr", "mister", "ms.", "ms", "miss", "mrs.", - "mrs", "mistress", "hn.", "hn", "honorable", "the", "honorable", "his", "her", "honor", "fr", "fr.", - "frau", "hr", "herr", "rv.", "rv", "rev.", "rev", "reverend", "reverend", "madam", "lord", "lady", - "sir", "senior", "bishop", "rabbi", "holiness", "rebbe", "deacon", "eminence", "majesty", "consul", - "vice", "president", "ambassador", "secretary", "undersecretary", "deputy", "inspector", "ins.", - "detective", "det", "det.", "constable", "private", "pvt.", "pvt", "petty", "p.o.", "po", "first", - "class", "p.f.c.", "pfc", "lcp.", "lcp", "corporal", "cpl.", "cpl", "colonel", "col", "col.", - "capitain", "cpt.", "cpt", "ensign", "ens.", "ens", "lieutenant", "lt.", "lt", "ltc.", "ltc", - "commander", "cmd.", "cmd", "cmdr", "rear", "radm", "r.adm.", "admiral", "adm.", "adm", "commodore", - "cmd.", "cmd", "general", "gen", "gen.", "ltgen", "lt.gen.", "maj.gen.", "majgen.", "major", "maj.", - "mjr", "maj", "seargent", "sgt.", "sgt", "chief", "cf.", "cf", "petty", "officer", "c.p.o.", "cpo", - "master", "cmcpo", "fltmc", "formc", "mcpo", "mcpocg", "command", "fleet", "force" }) { - NameParser.TITLES.add(title); - } - - for (String suffix : new String[] { "jr.", "jr", "junior", "ii", "iii", "iv", "senior", "sr.", "sr", //family - "phd", "ph.d", "ph.d.", "m.d.", "md", "d.d.s.", "dds", // doctors - "k.c.v.o", "kcvo", "o.o.c", "ooc", "o.o.a", "ooa", "g.b.e", "gbe", // knighthoods - "k.b.e.", "kbe", "c.b.e.", "cbe", "o.b.e.", "obe", "m.b.e", "mbe", // cont - "esq.", "esq", "esquire", "j.d.", "jd", // lawyers - "m.f.a.", "mfa", //misc - "r.n.", "rn", "l.p.n.", "lpn", "l.n.p.", "lnp", //nurses - "c.p.a.", "cpa", //money men - "d.d.", "dd", "d.div.", "ddiv", //preachers - "ret", "ret." }) { - NameParser.SUFFIXES.add(suffix); - } - - for (String comp : new String[] { "de", "la", "st", "st.", "ste", "ste.", "saint", "van", "der", "al", "bin", - "le", "mac", "di", "del", "vel", "von", "e'", "san", "af", "el", "\'t" }) { - NameParser.COMPOUND_NAMES.add(comp); - } - } + private JythonNameParser jythonParser; + public NameParser() { + JythonObjectFactory factory = new JythonObjectFactory(JythonNameParser.class, "jython_name_parser", "HumanParser"); + this.jythonParser = (JythonNameParser) factory.createObject(); + } + /** * This method will parse a name into first middle and last names. - *

- * Notes: "Al" is treated as a name. "al" as a name fragment. That is the - * only exception for capitalization. - *

- * @param name name to parse - * @return String[5] containing title, first, middle and last names, suffix + * @return Map containing title, first, middle and last names, suffix */ - public String[] parseName(String name) { - // NOTE Add lookahead for Suffixes to support - // "Winthrop Wolfcasts, the 31st Duke of Winchester" - String[] result = new String[5]; - - if (name == null) { - return result; - } - - StringBuffer title = new StringBuffer(); - StringBuffer first = new StringBuffer(); - StringBuffer middle = new StringBuffer(); - StringBuffer last = new StringBuffer(); - StringBuffer suffix = new StringBuffer(); - boolean isLastCommaFirst = false; - - if (name.indexOf(",") != -1) { - String[] lastRest = name.split(","); - - if (lastRest.length > 2) { - isLastCommaFirst = true; - } else if (lastRest.length > 1) { - String[] suffixes = lastRest[1].toLowerCase().trim().split(" "); - - for (String check : suffixes) { - if (!NameParser.SUFFIXES.contains(check)) { - isLastCommaFirst = true; - - break; - } - } - } else if (lastRest.length == 1) { - name = name.replaceFirst(",$", ""); - } - } - - if (isLastCommaFirst) // the user split the last name - { - ArrayList lastRest = new ArrayList(Arrays.asList(name.split(","))); - -// if (lastRest.size() > 2) { -// for (int i = 2; i < lastRest.length; i++) //append the remaining elements to the end of the second element -// { -// lastRest[1] += (" " + lastRest[i]); -// } -// } - - result[NameParser.LAST_NAME] = lastRest.remove(0).trim(); - - if ((lastRest.size() == 1) && (lastRest.get(0).trim().indexOf(" ") == -1)) // easy case - { - result[NameParser.FIRST_NAME] = lastRest.remove(0).trim(); - - return result; - } else { - // join the rest together and split again on whitespace - ArrayList rest = new ArrayList(Arrays.asList(StringUtils.join(lastRest, " ").trim().split("\\s+"))); - - //parse titles - for (int i = 0; i < rest.size(); i++) { - if (NameParser.TITLES.contains(rest.get(i).toLowerCase().trim())) { - title.append(rest.remove(i)); - } - } - - if (title.length() > 0) { - result[NameParser.TITLE] = title.toString(); - } - - //parse suffixes - for (int i = 0; i < rest.size(); i++) { - if (NameParser.SUFFIXES.contains(rest.get(i).toLowerCase().trim())) { - suffix.insert(0, rest.remove(i)); - } - } - - if (suffix.length() > 0) { - result[NameParser.SUFFIX] = suffix.toString(); - } - - int[] nextNameOrder = new int[] { NameParser.FIRST_NAME, NameParser.MIDDLE_NAME }; - int nextNameIndex = 0; - - for (int i = 0; i < rest.size(); i++) { - StringBuffer nextName = new StringBuffer(); - - while (!rest.get(i).trim().equals("Al") && NameParser.COMPOUND_NAMES.contains(rest.get(i).toLowerCase().trim())) { - nextName.append(rest.get(i).trim()); - - if (i != (rest.size() - 1)) { - nextName.append(' '); - } - - i++; - - if (i == (rest.size() - 1)) { - break; - } - } - - nextName.append(rest.get(i)); - if (nextNameIndex < nextNameOrder.length) { - result[nextNameOrder[nextNameIndex]] = nextName.toString(); - } else { - result[nextNameOrder[nextNameOrder.length - 1]] += " " + nextName.toString(); - } - nextNameIndex++; - - -// if (nextNameIndex == nextNameOrder.length) { -// for (int j = i + 1; j < tail; j++) { -// if (j != (i + 1)) { -// nextName.append(' '); -// } -// -// nextName.append(rest[j]); -// } -// -// result[nextNameOrder[nextNameIndex - 1]] = nextName.toString(); -// -// break; -// } - } - } - } // end last, first case. - else { - String[] names = name.split(" "); - int head = 0; - int tail = names.length - 1; - - //parse titles - for (int i = head; (i < tail) && NameParser.TITLES.contains(names[i].toLowerCase().trim()); i++) { - if (i != 0) { - title.append(' '); - } - - title.append(names[i]); - head++; - } - - if (title.length() > 0) { - result[NameParser.TITLE] = title.toString(); - } - - //parse suffixes - for (int i = tail; (i >= head) && NameParser.SUFFIXES.contains(names[i].toLowerCase().trim()); i--) { - if (i != tail) { - suffix.insert(0, ' '); - } - - suffix.insert(0, names[i]); - tail--; - } - - if (suffix.length() > 0) { - result[NameParser.SUFFIX] = suffix.toString(); - names[tail] = names[tail].replaceAll(",", ""); - } - - if (head == tail) { //Only one name left - - if (names[head].trim().length() > 0) { - result[NameParser.LAST_NAME] = names[head]; - } - } else { - //parse last name - last.append(names[tail]); - tail--; - - for (int i = tail; (i >= head) && !names[i].trim().equals("Al") - && NameParser.COMPOUND_NAMES.contains(names[i].toLowerCase().trim()); i--) { - last.insert(0, ' '); - - last.insert(0, names[i]); - tail--; - } - - boolean firstPass = true; - - //parse first name - for (int i = head; i <= tail; i++) { - if (!firstPass) { - first.append(' '); - } - - first.append(names[i].trim()); - head++; - firstPass = false; - - if (names[i].trim().equals("Al") || !NameParser.COMPOUND_NAMES.contains(names[i].trim().toLowerCase())) { - break; - } - } - - //build middle name - for (int i = head; i <= tail; i++) { - if (i != head) { - middle.append(' '); - } - - middle.append(names[i].trim()); - } - } - - if (first.length() > 0) { - result[NameParser.FIRST_NAME] = first.toString().trim(); - } - - if (last.length() > 0) { - result[NameParser.LAST_NAME] = last.toString().trim(); - } - - if (middle.length() > 0) { - result[NameParser.MIDDLE_NAME] = middle.toString().trim(); - } - } - - return result; + public Map parseName(String name) { + return jythonParser.parse_human_name(name); } } diff --git a/contrib/examples/adsabs/server/solr/collection1/conf/schema.xml b/contrib/examples/adsabs/server/solr/collection1/conf/schema.xml index 3ec97fd0f..d77125007 100644 --- a/contrib/examples/adsabs/server/solr/collection1/conf/schema.xml +++ b/contrib/examples/adsabs/server/solr/collection1/conf/schema.xml @@ -528,7 +528,7 @@ - + @@ -536,14 +536,15 @@ expression --> + replacement="$1$3" /> + pattern="\b(?i:(ABELL)(-|\s+)([0-9]+[A-Z]*))\b" replacement="$1$3" /> + pattern="\b(?i:(NGC|N)(-|\s+)([0-9]+[A-Z]*))\b" replacement="$1$3" /> + replacement="$1$3" /> + @@ -561,6 +562,7 @@ catenateNumbers="0" catenateAll="1" splitOnCaseChange="0" splitOnNumerics="0" stemEnglishPossessive="1" preserveOriginal="0" /> + @@ -890,7 +892,6 @@ types="wdafftypes.txt" /> - - - - @@ -1291,6 +1292,9 @@ docValues="true"/> + + - - - - - - diff --git a/contrib/examples/adsabs/server/solr/collection1/conf/solrconfig.xml b/contrib/examples/adsabs/server/solr/collection1/conf/solrconfig.xml index f6973a967..96d511ee8 100644 --- a/contrib/examples/adsabs/server/solr/collection1/conf/solrconfig.xml +++ b/contrib/examples/adsabs/server/solr/collection1/conf/solrconfig.xml @@ -5,6 +5,7 @@ 6.0 + @@ -34,12 +35,13 @@ + ${solr.maxIndexingThreads:8} + ${solr.useCompoundFile:false} ${solr.ramBufferSize:1000} ${solr.maxBufferedDocs:50000} - - - ${montysolr.autoSoftCommit.maxTime:-1} - first_author^0.9 author^0.85 year^0.8 title^0.8 abstract^0.7 identifier^0.8 bibstem^0.8 keyword^0.8 @@ -342,8 +338,7 @@ 0.5 first_author^14 author^13 year^10 bibstem^10 SYNONYM - aff_raw,aff_id,institution - 2 + AND @@ -362,7 +357,7 @@ explicit 10 - first_author^0.9 author^0.85 year^0.8 title^0.8 abstract^0.7 identifier^0.8 bibstem^0.8 keyword^0.8 + first_author^5 author^2 title^1.5 abstract^1.3 identifier^1 bibstem^1 year^2 aqp disjuncts simple @@ -372,8 +367,7 @@ 0.5 first_author^14 author^13 year^10 bibstem^10 SYNONYM - aff_raw,aff_id,institution - 2 + AND unfielded_search @@ -419,7 +413,7 @@ Make sure these defaults are set also in other public query handlers (e.g. tvrh - used by the word cloud) --> - first_author^0.9 author^0.85 year^0.8 title^0.8 abstract^0.7 identifier^0.8 bibstem^0.8 keyword^0.8 + first_author^5 author^2 title^1.5 abstract^1.3 identifier^1 bibstem^1 year^2 aqp disjuncts simple @@ -427,7 +421,7 @@ edismax_combined_aqp true true - 2 + AND unfielded_search @@ -700,7 +694,7 @@ AND arxiv identifier;collection database entdate entry_date;pubdate date;author_nosyn author_notrans author_nosyn_notrans author - author^1.5 title^1.4 abstract^1.3 all + author^1.5 title^1.4 abstract^1.3 @@ -724,7 +718,7 @@ AND arxiv identifier;collection database entdate entry_date;pubdate date;author_nosyn author_notrans author_nosyn_notrans author;title_nosyn title;alternate_title_nosyn alternate_title;abstract_nosyn abstract;all_nosyn all;full_nosyn full;body_nosyn body;ack_nosyn ack;keyword_nosyn keyword - author^1.5 title^1.4 abstract^1.3 all + author^1.5 title^1.4 abstract^1.3 yyyy-MM-dd'T'HH:mm:ss yyyy-MM-dd'T'HH:mm:ss.SSS entry_date,date @@ -757,7 +751,7 @@ AND arxiv identifier;collection database entdate entry_date;pubdate date;author_nosyn author_notrans author_nosyn_notrans author;title_nosyn title;alternate_title_nosyn alternate_title;abstract_nosyn abstract;all_nosyn all;full_nosyn full;body_nosyn body;ack_nosyn ack;keyword_nosyn keyword - author^1.5 title^1.4 abstract^1.3 all + author^1.5 title^1.4 abstract^1.3 yyyy-MM-dd'T'HH:mm:ss yyyy-MM-dd'T'HH:mm:ss.SSS entry_date,date @@ -836,7 +830,7 @@ AND arxiv identifier;collection database entdate entry_date;pubdate date;author_nosyn author_notrans author_nosyn_notrans author;title_nosyn title;alternate_title_nosyn alternate_title;abstract_nosyn abstract;all_nosyn all;full_nosyn full;body_nosyn body;ack_nosyn ack;keyword_nosyn keyword - author^1.5 title^1.4 abstract^1.3 all + author^1.5 title^1.4 abstract^1.3 yyyy-MM-dd'T'HH:mm:ss yyyy-MM-dd'T'HH:mm:ss.SSS entry_date,date