Skip to content

Commit

Permalink
[NOID] Add new regex matcher by group names
Browse files Browse the repository at this point in the history
  • Loading branch information
gem-neo4j committed Aug 14, 2024
1 parent 9e7737d commit cbf76bb
Show file tree
Hide file tree
Showing 2 changed files with 190 additions and 3 deletions.
48 changes: 48 additions & 0 deletions core/src/main/java/apoc/text/Strings.java
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
Expand All @@ -42,6 +43,7 @@
import java.util.function.Supplier;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
import org.apache.commons.lang3.StringUtils;
Expand Down Expand Up @@ -182,6 +184,52 @@ public List<List<String>> regexGroups(final @Name("text") String text, final @Na
}
}

/**
 * Finds every match of {@code regex} in {@code text} and reports, for each match, the matched
 * text together with the values captured by the regex's named groups.
 *
 * <p>Each element of the returned list is a map with two entries:
 * <ul>
 *   <li>{@code "group"} - the whole substring matched by the regex;</li>
 *   <li>{@code "matches"} - a map from group name to captured text; named groups that did not
 *       take part in the match are left out.</li>
 * </ul>
 *
 * @param text the text to search; {@code null} yields an empty list
 * @param regex the regular expression, optionally containing named groups; {@code null} yields
 *     an empty list
 * @return one map per match, in the order the matches occur in {@code text}
 * @throws RuntimeException if {@code regex} is not a valid pattern; the message is
 *     {@code "Invalid regex pattern: "} followed by the parser's own message
 */
@UserFunction("apoc.text.regexGroupsByName")
@Description("Returns all groups with their group name matching the given regular expression in the given text.")
public List<Map<String, Object>> regexGroupsByName(
        final @Name("text") String text, final @Name("regex") String regex) {
    if (text == null || regex == null) {
        // Type-safe equivalent of the raw Collections.EMPTY_LIST constant.
        return Collections.emptyList();
    }

    final Pattern pattern;
    try {
        pattern = Pattern.compile(regex);
    } catch (PatternSyntaxException e) {
        // The cause is deliberately not chained: callers that look at the root cause
        // (the accompanying tests do) expect this exception and its prefixed message.
        throw new RuntimeException("Invalid regex pattern: " + e.getMessage());
    }

    final List<String> namedGroups = getNamedGroups(regex);
    final Matcher matcher = pattern.matcher(text);
    final List<Map<String, Object>> result = new ArrayList<>();
    while (matcher.find()) {
        final Map<String, Object> matches = new HashMap<>();
        for (String groupName : namedGroups) {
            final String match = matcher.group(groupName);
            // A null capture means the group did not participate in this match.
            if (match != null) {
                matches.put(groupName, match);
            }
        }

        final Map<String, Object> matchGroupResult = new HashMap<>();
        matchGroupResult.put("group", matcher.group());
        matchGroupResult.put("matches", matches);
        result.add(matchGroupResult);
    }
    return result;
}

/**
 * Opening of a named capturing group, e.g. {@code (?<name>}. The name is restricted to what
 * {@link Pattern} accepts (a letter followed by letters or digits) so that the look-behind
 * constructs {@code (?<=...)} and {@code (?<!...)} are not mistaken for named groups.
 */
private static final Pattern NAMED_GROUP_OPENING = Pattern.compile("\\(\\?<([a-zA-Z][a-zA-Z0-9]*)>");

/**
 * Extracts the names of the named capturing groups declared in a regular expression.
 *
 * <p>The expression is scanned textually. Look-behinds and escaped parentheses
 * (such as {@code \(?<a>}) are ignored. Text inside character classes or
 * {@code \Q...\E} quoting is not recognised as such and may still be reported.
 *
 * @param text the source of the regular expression
 * @return the group names in the order they are declared; empty if there are none
 */
private List<String> getNamedGroups(String text) {
    List<String> namedGroups = new ArrayList<>();

    Matcher opening = NAMED_GROUP_OPENING.matcher(text);
    while (opening.find()) {
        // Count the backslashes directly before the parenthesis: an odd count means the
        // parenthesis itself is escaped (a literal "("), so no group is opened here.
        int backslashes = 0;
        for (int i = opening.start() - 1; i >= 0 && text.charAt(i) == '\\'; i--) {
            backslashes++;
        }
        if (backslashes % 2 == 0) {
            namedGroups.add(opening.group(1));
        }
    }
    return namedGroups;
}

@UserFunction("apoc.text.join")
@Description("Joins the given `STRING` values using the given delimiter.")
public String join(final @Name("texts") List<String> texts, final @Name("delimiter") String delimiter) {
Expand Down
145 changes: 142 additions & 3 deletions core/src/test/java/apoc/text/StringsTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
import static java.lang.Math.toIntExact;
import static java.util.Arrays.asList;
import static java.util.Collections.singletonList;
import static junit.framework.TestCase.assertEquals;
import static junit.framework.TestCase.assertTrue;
import static org.junit.Assert.*;

import apoc.util.TestUtil;
Expand All @@ -35,6 +37,7 @@
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.exception.ExceptionUtils;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.ClassRule;
Expand Down Expand Up @@ -67,9 +70,12 @@ public static void teardown() {

// Runs apoc.text.indexOf(text, 'World', 3) on 'Hello World!' and uses the returned index as the
// start of a Cypher substring() call; the resulting value is expected to be "World!".
@Test
public void testIndexOfSubstring() {
String query = "WITH 'Hello World!' as text\n"
+ "WITH text, size(text) as len, apoc.text.indexOf(text, 'World',3) as index\n"
+ "RETURN substring(text, case index when -1 then len-1 else index end, len) as value;\n";
String query =
"""
WITH 'Hello World!' as text
WITH text, size(text) as len, apoc.text.indexOf(text, 'World',3) as index
RETURN substring(text, case index when -1 then len-1 else index end, len) as value;
""";
testCall(db, query, (row) -> assertEquals("World!", row.get("value")));
}

Expand Down Expand Up @@ -532,6 +538,139 @@ public void testRegexGroups() {
});
}

/**
 * A text matched once by a regex with two named groups must yield exactly one entry that
 * carries the whole match and both captured values.
 */
@Test
public void singleGroupNyName() {
    testResult(
            db,
            "RETURN apoc.text.regexGroupsByName('tenable_asset','(?<firstPart>\\w+)\\_(?<secondPart>\\w+)') AS result",
            result -> {
                final List<Object> actual = Iterators.single(result.columnAs("result"));

                final List<Map<String, Object>> expected = List.of(Map.of(
                        "group",
                        "tenable_asset",
                        "matches",
                        Map.of("firstPart", "tenable", "secondPart", "asset")));
                // Exact equality: a containsAll check would also accept spurious extra matches.
                assertEquals(expected, actual);
            });
}

/**
 * A regex that matches twice must yield two entries, in the order the matches occur in the
 * text, each with its own whole match and named captures.
 */
@Test
public void multipleGroupsNyName() {
    testResult(
            db,
            "RETURN apoc.text.regexGroupsByName('abc <link xxx1>yyy1</link> def <link xxx2>yyy2</link>','<link (?<firstPart>\\\\w+)>(?<secondPart>\\\\w+)</link>') AS result",
            result -> {
                final List<Object> actual = Iterators.single(result.columnAs("result"));

                final List<Map<String, Object>> expected = List.of(
                        Map.of(
                                "group",
                                "<link xxx1>yyy1</link>",
                                "matches",
                                Map.of("firstPart", "xxx1", "secondPart", "yyy1")),
                        Map.of(
                                "group",
                                "<link xxx2>yyy2</link>",
                                "matches",
                                Map.of("firstPart", "xxx2", "secondPart", "yyy2")));
                // Exact equality also pins the number and the order of the matches,
                // which a containsAll check would not.
                assertEquals(expected, actual);
            });
}

/**
 * When the optional first named group does not take part in the match, it must be absent
 * from the "matches" map rather than present with a null value.
 */
@Test
public void groupNyNameWithMissingFirstGroup() {
    testResult(
            db,
            "RETURN apoc.text.regexGroupsByName('_asset','(?<firstPart>\\w+)?\\_(?<secondPart>\\w+)') AS result",
            result -> {
                final List<Object> actual = Iterators.single(result.columnAs("result"));

                final List<Map<String, Object>> expected =
                        List.of(Map.of("group", "_asset", "matches", Map.of("secondPart", "asset")));
                // Exact equality: a containsAll check would also accept spurious extra matches.
                assertEquals(expected, actual);
            });
}

/**
 * When the optional second named group does not take part in the match, it must be absent
 * from the "matches" map rather than present with a null value.
 */
@Test
public void groupNyNameWithMissingSecondGroup() {
    testResult(
            db,
            "RETURN apoc.text.regexGroupsByName('asset_','(?<firstPart>\\w+)?\\_(?<secondPart>\\w+)?') AS result",
            result -> {
                final List<Object> actual = Iterators.single(result.columnAs("result"));

                final List<Map<String, Object>> expected =
                        List.of(Map.of("group", "asset_", "matches", Map.of("firstPart", "asset")));
                // Exact equality: a containsAll check would also accept spurious extra matches.
                assertEquals(expected, actual);
            });
}

/**
 * A text the regex never matches ('hello' contains no underscore) must produce an empty list.
 */
@Test
public void groupNyNameNoMatches() {
    testResult(
            db,
            "RETURN apoc.text.regexGroupsByName('hello','(?<firstPart>\\w+)?\\_(?<secondPart>\\w+)?') AS result",
            result -> {
                final List<Object> actual = Iterators.single(result.columnAs("result"));

                // containsAll(emptyList) is true for any list, so assert emptiness explicitly.
                assertEquals(List.of(), actual);
            });
}

// A regex that declares the same group name twice cannot be compiled: the query must fail and
// the root cause must be a RuntimeException carrying the "Invalid regex pattern: " message.
@Test
public void groupNyNameWithInvalidPattern1() {
QueryExecutionException e = assertThrows(
QueryExecutionException.class,
() -> testCall(
db,
"RETURN apoc.text.regexGroupsByName('asset_','(?<firstPart>\\w+)?\\_(?<firstPart>\\w+)?') AS result",
(r) -> {}));
// Unwrap the query-level exception down to the innermost cause before inspecting it.
Throwable except = ExceptionUtils.getRootCause(e);
assertTrue(except instanceof RuntimeException);
// NOTE(review): the expected message is a text block, so the leading whitespace of its lines
// (in particular the '^' marker line) is part of the compared value - confirm the indentation
// in the real file puts the marker under the reported index.
assertEquals(
"""
Invalid regex pattern: Named capturing group <firstPart> is already defined near index 32
(?<firstPart>\\w+)?\\_(?<firstPart>\\w+)?
^""",
except.getMessage());
}

/**
 * A regex with an unterminated named group cannot be compiled: the query must fail and the
 * root cause must be a RuntimeException carrying the "Invalid regex pattern: " message.
 */
@Test
public void groupNyNameWithInvalidPattern2() {
    final String query = "RETURN apoc.text.regexGroupsByName('asset_','(?<firstPart') AS result";

    final QueryExecutionException thrown =
            assertThrows(QueryExecutionException.class, () -> testCall(db, query, (r) -> {}));

    // Unwrap the query-level exception down to the innermost cause before inspecting it.
    final Throwable rootCause = ExceptionUtils.getRootCause(thrown);
    assertTrue(rootCause instanceof RuntimeException);
    assertEquals(
            """
            Invalid regex pattern: named capturing group is missing trailing '>' near index 12
            (?<firstPart""",
            rootCause.getMessage());
}

/**
 * A regex with only unnamed groups still reports each match, but with an empty "matches" map
 * because there are no group names to report captures under.
 */
@Test
public void groupNyNameWithNoGroupNames() {
    testResult(db, "RETURN apoc.text.regexGroupsByName('asset_','(\\w+)?\\_(\\w+)?') AS result", result -> {
        final List<Object> actual = Iterators.single(result.columnAs("result"));

        // The regex matches 'asset_' once. The previous containsAll(emptyList) check was
        // true for any result and therefore verified nothing.
        final List<Map<String, Object>> expected = List.of(Map.of("group", "asset_", "matches", Map.of()));
        assertEquals(expected, actual);
    });
}

/**
 * Passing null as either argument must not raise an exception; the returned rows are not
 * inspected.
 */
@Test
public void testRegexGroupsByNameForNPE() {
    final String nullTextQuery =
            "RETURN apoc.text.regexGroupsByName(null,'<link (?<firstPart>\\\\w+)>(?<secondPart>\\\\w+)</link>') AS result";
    final String nullRegexQuery = "RETURN apoc.text.regexGroupsByName('abc',null) AS result";

    // Completing both calls without an exception is the whole assertion.
    testCall(db, nullTextQuery, ignored -> {});
    testCall(db, nullRegexQuery, ignored -> {});
}

@Test
public void testRegexGroupsForNPE() {
// throws no exception
Expand Down

0 comments on commit cbf76bb

Please sign in to comment.