Skip to content

Commit

Permalink
One implementation of the synonym->transliteration fix (jones>forman …
Browse files Browse the repository at this point in the history
…situation is failing)
  • Loading branch information
romanchyla committed Jan 22, 2021
1 parent e0c2fb6 commit f509f83
Show file tree
Hide file tree
Showing 4 changed files with 214 additions and 24 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -448,5 +448,146 @@ public TokenStream create(TokenStream input) {
// NewSynonymFilterFactory
return input;
}

/**
 * Synonym builder factory that rewrites author-name synonym rules on the fly
 * before handing them to {@link NewSolrSynonymParser}.
 *
 * <p>Every input line is split into names using the configured separator
 * (a comma by default, a semicolon when the {@code format} argument equals
 * {@code "semicolon"}). For plain synonym groups, each name recognised as a
 * "long form" (some part after the first is longer than one character) also
 * contributes its short form (first part plus the initial of every following
 * part). The rewritten rules are always emitted comma-separated, with spaces
 * and commas inside names backslash-escaped.
 */
public static class SimpleShortNames extends NewSynonymFilterFactory.SynonymBuilderFactory {

  public SimpleShortNames(Map<String,String> args) {
    super(args);
  }

  /**
   * Creates the parser that performs the rewriting described on the class.
   *
   * @param analyzer analyzer passed through to {@link NewSolrSynonymParser}
   * @return a parser whose {@code add(Reader)} preprocesses the rules
   */
  protected SynonymParser getParser(Analyzer analyzer) {

    // Names are comma-separated unless format="semicolon" was configured.
    // The constant-first equals() also tolerates a null value for the key.
    final char charSeparator = "semicolon".equals(args.get("format")) ? ';' : ',';

    return new NewSolrSynonymParser(true, true, analyzer) {

      /**
       * Reads the synonym rules, rewrites them line by line and passes the
       * result to the parent parser.
       *
       * @param in source of the synonym rules; always closed by this method
       * @throws IOException if reading the rules fails
       * @throws ParseException if a rule is malformed; the message carries
       *         the line number reported by the reader
       */
      public void add(Reader in) throws IOException, ParseException {
        LineNumberReader br = new LineNumberReader(in);
        StringBuilder newBr = new StringBuilder();
        String line;

        try {
          while ((line = br.readLine()) != null) {
            // modify the original on-the-fly
            if (line.length() == 0 || line.charAt(0) == '#') {
              continue; // ignore empty lines and comments
            }
            String[] sides = line.split("=>");
            if (sides.length == 0) {
              // e.g. a line consisting only of "=>": split() drops all empty parts
              throw new IllegalArgumentException("Synonym rule has no names: " + line);
            }
            if (sides.length > 1) { // explicit mapping
              // NOTE(review): the left-hand side (sides[0]) is not used; the first
              // name of the right-hand side becomes the key - confirm this is intended.
              String[] names = getNames(sides[1]);
              if (names.length == 0) {
                throw new IllegalArgumentException("Synonym rule has an empty right-hand side: " + line);
              }
              newBr.append(escape(names[0]));
              newBr.append("=>");
              for (int i = 0; i < names.length; i++) {
                if (i > 0) {
                  newBr.append(',');
                }
                newBr.append(escape(names[i]));
              }
            }
            else {
              newBr.append(buildLine(getNames(sides[0])));
            }
            newBr.append("\n");
          }
        } catch (IllegalArgumentException e) {
          ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0);
          ex.initCause(e);
          throw ex;
        } finally {
          br.close();
        }

        // pass the modified synonym to the builder to create a synonym map;
        // encode and decode with the same charset so that non-ASCII names
        // survive regardless of the platform default encoding
        Charset utf8 = Charset.forName("UTF-8");
        super.add(new InputStreamReader(new ByteArrayInputStream(newBr.toString().getBytes(utf8)),
            utf8.newDecoder()));
      }

      /** Always keeps the original token, whatever the caller requested. */
      @Override
      public void add(CharsRef input, CharsRef output, boolean includeOrig) {
        super.add(input, output, true);
      }

      /** Splits a rule side on the configured separator and unescapes every name. */
      private String[] getNames(String vals) {
        List<String> nn = StrUtils.splitSmart(vals, charSeparator);
        String[] names = new String[nn.size()];
        int j = 0;
        for (String n : nn) {
          names[j++] = unescape(n);
        }
        return names;
      }

      /**
       * Builds one comma-separated synonym group from the given names, adding
       * the short form of every long-form name. Duplicates are removed; the
       * order of the output follows the set's iteration order.
       */
      private String buildLine(String[] names) {
        HashSet<String> set = new HashSet<String>();
        for (String name : names) {
          String[] p = AuthorUtils.splitName(name);
          if (isLongForm(p)) {
            set.add(makeShortForm(p));
          }
          set.add(name);
        }

        StringBuilder out = new StringBuilder();
        boolean notFirst = false;
        for (String name : set) {
          if (notFirst) {
            out.append(',');
          }
          out.append(escape(name));
          notFirst = true;
        }
        return out.toString();
      }

      /** Removes the backslash escaping of spaces and of the input separator. */
      private String unescape(String s) {
        return s.replace("\\ ", " ").replace("\\" + charSeparator, String.valueOf(charSeparator));
      }

      /**
       * Backslash-escapes spaces and commas. The comma is hard-coded because
       * the rewritten rules are always emitted comma-separated.
       */
      private String escape(String s) {
        return s.replace(" ", "\\ ").replace(",", "\\,");
      }

      /** Returns the first part followed by the initial of every further non-empty part. */
      private String makeShortForm(String[] parts) {
        StringBuilder out = new StringBuilder();
        out.append(parts[0]);
        for (int i = 1; i < parts.length; i++) {
          if (parts[i].length() == 0) {
            continue; // nothing to abbreviate; avoids StringIndexOutOfBoundsException
          }
          out.append(" ");
          out.append(parts[i].substring(0, 1));
        }
        return out.toString();
      }

      /** Returns true when any part after the first is longer than one character. */
      private boolean isLongForm(String[] parts) {
        for (int i = 1; i < parts.length; i++) {
          if (parts[i].length() > 1) {
            return true;
          }
        }
        return false;
      }
    };
  }
}


}
Original file line number Diff line number Diff line change
Expand Up @@ -140,10 +140,9 @@ public void testADSOperators() throws Exception {
LuceneCacheWrapper<NumericDocValues> boostTwo = LuceneCacheWrapper.getFloatCache(
"boost_2", UninvertingReader.Type.FLOAT_POINT, tempReq.getSearcher().getSlowAtomicReader());


assertEquals("Unexpected value from cache", 1.0f, boostConstant.getFloat(0), 0.0f);
assertEquals("Unexpected value from cache", 0.1f, boostOne.getFloat(0), 0.0f);
assertEquals("Unexpected value from cache", 0.5f, boostTwo.getFloat(0), 0.0f);
assertEquals("wrong data", 1.0f, boostConstant.getFloat(0), 0.0f);
assertEquals("wrong data", 0.1f, boostOne.getFloat(0), 0.0f);
assertEquals("wrong data", 0.5f, boostTwo.getFloat(0), 0.0f);

// expecting 4 results with various order, simply based on the boost factor
testQ2("id:1", new SecondOrderCollectorOperatorExpertsCiting(referencesWrapper, boostConstant),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -127,12 +127,15 @@ public static String getSchemaFile() {
"ADAMŠuk, m; ADAMGuk, m;ADAMČuk, m", // hand-made additions
"MÜLLER, A WILLIAM;MÜLLER, A BILL",
"MÜLLER, WILLIAM;MÜLLER, BILL",
"JONES, CHRISTINE;FORMAN, CHRISTINE", // the famous post-synonym expansion
//"JONES, CHRISTINE;FORMAN, CHRISTINE", // the famous post-synonym expansion
"JONES, C=>Jones,Christine;FORMAN, CHRISTINE", // the famous post-synonym expansion
"FORMAN, C=>FORMAN, CHRISTINE;JONES, C", // the famous post-synonym expansion
"DE ZEEUW, TIM=>DE ZEEUW, P TIM",
"DE ZEEUW, P TIM=>DE ZEEUW, TIM;DE ZEEUW,",
"grant, carolyn s; stern grant, carolyn; stern, carolyn p",
"orlitova, ivana; stoklasova, ivana",
"orlitova,; stoklasova,"
"orlitova,; stoklasova,",
"wedemeyer boehm, s; wedemeyer, s"
});

// automatically harvested variations of author names (collected during indexing)
Expand Down Expand Up @@ -165,7 +168,13 @@ public static String getSchemaFile() {
"Gonzalez Alfonso, E=>González Alfonso, E",
"Chyelkovae,=>Chýlková,",
"stoklasova,=>stoklasová,",
"orlitova,=>orlitová,"
"orlitova,=>orlitová,",
"wedemeyer boehm, s=>wedemeyer böhm, s",
"wedemeyer boehm, sven=>wedemeyer böhm, sven",
"wedemeyer boehm,=>wedemeyer böhm,",
"wedemeyer bohm, s=>wedemeyer böhm, s",
"wedemeyer bohm, sven=>wedemeyer böhm, sven",
"wedemeyer bohm,=>wedemeyer böhm,"
}
));

Expand Down Expand Up @@ -378,6 +387,32 @@ public void xtestX() throws Exception {

public void testAuthorParsingUseCases() throws Exception {

assertQueryEquals(req("q", "author:\"Wedemeyer, Sven\""),
"wedemeyer boehm, | wedemeyer boehm, s | wedemeyer boehm, s * | wedemeyer boehm, sven | wedemeyer boehm, sven * | wedemeyer bohm, | wedemeyer bohm, s | wedemeyer bohm, s * | wedemeyer bohm, sven | wedemeyer bohm, sven * | wedemeyer böhm, | wedemeyer böhm, s | wedemeyer böhm, s * | wedemeyer böhm, sven | wedemeyer böhm, sven * | wedemeyer, | wedemeyer, s | wedemeyer, s * | wedemeyer, sven | wedemeyer, sven *",
DisjunctionMaxQuery.class);
assertQueryEquals(req("q", "author:\"wedemeyer, sven\""),
"wedemeyer boehm, | wedemeyer boehm, s | wedemeyer boehm, s * | wedemeyer boehm, sven | wedemeyer boehm, sven * | wedemeyer bohm, | wedemeyer bohm, s | wedemeyer bohm, s * | wedemeyer bohm, sven | wedemeyer bohm, sven * | wedemeyer böhm, | wedemeyer böhm, s | wedemeyer böhm, s * | wedemeyer böhm, sven | wedemeyer böhm, sven * | wedemeyer, | wedemeyer, s | wedemeyer, s * | wedemeyer, sven | wedemeyer, sven *",
DisjunctionMaxQuery.class);
assertQueryEquals(req("q", "author:\"wedemeyer böhm, sven\""),
"wedemeyer boehm, | wedemeyer boehm, s | wedemeyer boehm, s * | wedemeyer boehm, sven | wedemeyer boehm, sven * | wedemeyer bohm, | wedemeyer bohm, s | wedemeyer bohm, s * | wedemeyer bohm, sven | wedemeyer bohm, sven * | wedemeyer böhm, | wedemeyer böhm, s | wedemeyer böhm, s * | wedemeyer böhm, sven | wedemeyer böhm, sven * | wedemeyer, | wedemeyer, s | wedemeyer, s * | wedemeyer, sven | wedemeyer, sven *",
DisjunctionMaxQuery.class);
assertQueryEquals(req("q", "author:\"wedemeyer böhm, s\""),
"wedemeyer boehm, | wedemeyer boehm, s | wedemeyer boehm, s* | wedemeyer boehm, sven | wedemeyer boehm, sven * | wedemeyer bohm, | wedemeyer bohm, s | wedemeyer bohm, s* | wedemeyer bohm, sven | wedemeyer bohm, sven * | wedemeyer böhm, | wedemeyer böhm, s | wedemeyer böhm, s* | wedemeyer böhm, sven | wedemeyer böhm, sven * | wedemeyer, | wedemeyer, s | wedemeyer, s* | wedemeyer, sven | wedemeyer, sven *",
DisjunctionMaxQuery.class);
assertQueryEquals(req("q", "author:\"wedemeyer, s\""),
"wedemeyer boehm, | wedemeyer boehm, s | wedemeyer boehm, s* | wedemeyer boehm, sven | wedemeyer boehm, sven * | wedemeyer bohm, | wedemeyer bohm, s | wedemeyer bohm, s* | wedemeyer bohm, sven | wedemeyer bohm, sven * | wedemeyer böhm, | wedemeyer böhm, s | wedemeyer böhm, s* | wedemeyer böhm, sven | wedemeyer böhm, sven * | wedemeyer, | wedemeyer, s | wedemeyer, s* | wedemeyer, sven | wedemeyer, sven *",
DisjunctionMaxQuery.class);
assertQueryEquals(req("q", "author:\"wedemeyer böhm, s\""),
"wedemeyer boehm, | wedemeyer boehm, s | wedemeyer boehm, s* | wedemeyer boehm, sven | wedemeyer boehm, sven * | wedemeyer bohm, | wedemeyer bohm, s | wedemeyer bohm, s* | wedemeyer bohm, sven | wedemeyer bohm, sven * | wedemeyer böhm, | wedemeyer böhm, s | wedemeyer böhm, s* | wedemeyer böhm, sven | wedemeyer böhm, sven * | wedemeyer, | wedemeyer, s | wedemeyer, s* | wedemeyer, sven | wedemeyer, sven *",
DisjunctionMaxQuery.class);

// wedemeyer boehm, sven; wedemeyer, sven
// (author:wedemeyer boehm, sven | author:wedemeyer boehm, sven * | author:wedemeyer boehm, s | author:wedemeyer boehm, s * | author:wedemeyer boehm, | author:wedemeyer, sven | author:wedemeyer, sven * | author:wedemeyer, s | author:wedemeyer, s * | author:wedemeyer,)
// wedemeyer boehm, s; wedemeyer, s
// (author:wedemeyer boehm, s | author:wedemeyer boehm, s * | author:wedemeyer boehm, | author:wedemeyer boehm, sven | author:wedemeyer boehm, sven * | author:wedemeyer, s | author:wedemeyer, s * | author:wedemeyer, | author:wedemeyer, sven | author:wedemeyer, sven *)
// wedemeyer boehm, s; wedemeyer boehm, sven; wedemeyer, s; wedemeyer, sven
// (author:wedemeyer, sven | author:wedemeyer, sven * | author:wedemeyer, s | author:wedemeyer, s * | author:wedemeyer, | author:wedemeyer boehm, s | author:wedemeyer boehm, s * | author:wedemeyer boehm,)

assertQueryEquals(req("q", "author:\"van dok*, h\""), "author:van dok*, h", WildcardQuery.class);
assertQ(req("q", "author:\"van dok*, h\""),
"//*[@numFound='1']",
Expand Down Expand Up @@ -437,7 +472,7 @@ public void testAuthorParsingUseCases() throws Exception {
// expected:
// | author:orlitova, | author:stoklasová,* | author:orlitova, ivana | author:orlitova, ivana * | author:stoklasova, i | author:stoklasova, i * | author:stoklasova, ivana | author:stoklasova, ivana * | author:orlitova, i | author:orlitova, i * | author:orlitova,* | author:stoklasova, | author:stoklasova,* | author:orlitová, | author:orlitová,* | author:orlitovae, | author:orlitovae,* | author:stoklasová, | author:stoklasovae, | author:stoklasovae,*
// TODO: optimize the query, remove the clauses that match the doc twice

setDebug(true);
testAuthorQuery("\"stoklasova\"",
"author:orlitova, | author:stoklasová, | author:orlitova, ivana | author:stoklasova, i | author:stoklasova, ivana | author:orlitova, i | author:orlitova,* | author:stoklasova, | author:stoklasova,* | author:orlitová, | author:orlitová,* | author:orlitovae, | author:orlitovae,* | author:stoklasová,* | author:stoklasovae, | author:stoklasovae,*",
"//*[@numFound='0']");
Expand Down Expand Up @@ -2414,56 +2449,64 @@ public void testAuthorParsingMainLogic() throws Exception {
*
*/

setDebug(true);
testAuthorQuery(
//must NOT have "jones*", must have "jones, c;jones, christine"
"forman", "author:forman, | author:forman, c | author:jones, christine | author:jones, c " +
"author:forman, christine | author:forman,*",
"//*[@numFound='7']",
"//*[@numFound='7']"
// forman numFound=7
// 110 Jones, Christine 111 Jones, C 112 Forman, Christine
// 113 Forman, C 115 Jones, C 116 Forman, Christopher
// 117 Forman, C
// 117 Forman, C
);
testAuthorQuery(
//must NOT have "forman*", must have "forman, c;forman, christine"
// PLUS - must have other jones's and allen's
"jones", "author:jones, | author:jones, l | author:allen, l | author:allen, r l " +
"author:allen, lynne | author:jones, r l | author:jones, r lynne | author:jones, lynne " +
"author:allen, r lynne | author:forman, c | author:jones, christine | author:jones, c " +
"author:forman, christine | author:jones,*",
"//*[@numFound='15']",
"//*[@numFound='15']"
// jones numFound=15
// 110 Jones, Christine 111 Jones, C 112 Forman, Christine
// 113 Forman, C 114 Jones, Christopher 115 Jones, C
// 117 Forman, C 120 Allen, Lynne 121 Allen, L
// 122 Allen, R Lynne 123 Allen, R L 124 Jones, Lynne
// 125 Jones, L 126 Jones, R Lynne 127 Jones, R L
// 125 Jones, L 126 Jones, R Lynne 127 Jones, R L
);
testAuthorQuery(
//must NOT have "jones, c*", must have "jones, christine"
"\"forman, c\"", "author:forman, c | author:forman, christine | author:forman, c* | author:forman," +
"author:jones, christine | author:jones, c",
"//*[@numFound='7']",
"//*[@numFound='7']"
// "forman, c" numFound=7
// 110 Jones, Christine 111 Jones, C 112 Forman, Christine
// 113 Forman, C 115 Jones, C 116 Forman, Christopher
// 117 Forman, C

);
testAuthorQuery(
//must NOT have "forman, c*", must have "forman, christine"
"\"jones, c\"", "author:jones, c | author:jones, christine | author:jones, c* | author:jones," +
"author:forman, christine | author:forman, c",
"//*[@numFound='7']",
"//*[@numFound='7']"
// "jones, c" numFound=7
// 110 Jones, Christine 111 Jones, C 112 Forman, Christine
// 113 Forman, C 114 Jones, Christopher 115 Jones, C
// 117 Forman, C

);
testAuthorQuery(
"\"jones, christine\"",
"author:jones, christine | author:jones, christine * | author:jones, c " +
"author:jones, c * | author:jones, | author:forman, christine " +
"author:forman, christine * | author:forman, c | author:forman, c * " +
"author:forman,",
"//*[@numFound='6']",
"//*[@numFound='6']"
// "jones, christine" numFound=6
// 110 Jones, Christine 111 Jones, C 112 Forman, Christine
// 113 Forman, C 115 Jones, C 117 Forman, C

);
testAuthorQuery(
"\"forman, christine\"", "author:jones, christine | author:jones, christine * | author:jones, c " +
"author:jones, c * | author:jones, | author:forman, christine | author:forman, christine * " +
"author:forman, c | author:forman, c * | author:forman,",
Expand Down
Loading

0 comments on commit f509f83

Please sign in to comment.