From 83ab24252b3a5d784a2651852636c1c608480d5e Mon Sep 17 00:00:00 2001 From: "Flavio S. Glock" Date: Wed, 30 Jul 2025 15:05:44 +0200 Subject: [PATCH 1/3] fix tr/// interpolation WIP --- .../operators/RuntimeTransliterate.java | 137 +++++++++--------- .../org/perlonjava/parser/StringParser.java | 6 +- 2 files changed, 68 insertions(+), 75 deletions(-) diff --git a/src/main/java/org/perlonjava/operators/RuntimeTransliterate.java b/src/main/java/org/perlonjava/operators/RuntimeTransliterate.java index cff15045b..05cae4b9a 100644 --- a/src/main/java/org/perlonjava/operators/RuntimeTransliterate.java +++ b/src/main/java/org/perlonjava/operators/RuntimeTransliterate.java @@ -38,7 +38,8 @@ public static RuntimeTransliterate compile(RuntimeScalar search, RuntimeScalar r * Applies the transliteration pattern to the given string. * * @param originalString The original string to be transliterated - * @return A new RuntimeScalar containing the transliterated string + * @param ctx The runtime context + * @return A new RuntimeScalar containing the transliterated string or count */ public RuntimeScalar transliterate(RuntimeScalar originalString, int ctx) { String input = originalString.toString(); @@ -48,7 +49,7 @@ public RuntimeScalar transliterate(RuntimeScalar originalString, int ctx) { for (int i = 0; i < input.length(); i++) { char ch = input.charAt(i); - if (deleteChars[ch]) { + if (ch < 256 && deleteChars[ch]) { lastCharAdded = false; count++; // Count deleted characters } else if (ch < 256 && usedChars[ch]) { @@ -92,8 +93,9 @@ public void compileTransliteration(String search, String replace, String modifie squashDuplicates = modifiers.contains("s"); returnOriginal = modifiers.contains("r"); - String expandedSearch = expandRangesAndEscapes(search); - String expandedReplace = expandRangesAndEscapes(replace); + // Parse escape sequences first, then expand ranges + String expandedSearch = expandRanges(search); + String expandedReplace = expandRanges(replace); translationMap = new char[256]; usedChars = new boolean[256]; @@ -102,6 +104,7 @@ public void compileTransliteration(String search, String replace, String modifie for (int i = 0; i < 256; i++) { translationMap[i] = (char) i; usedChars[i] = false; + deleteChars[i] = false; } if (complement) { @@ -112,70 +115,34 @@ public void compileTransliteration(String search, String replace, String modifie } /** - * Expands character ranges and escape sequences in the input string. + * Expands character ranges like a-z, A-Z, 0-9. * - * @param input The input string containing ranges and escapes - * @return The expanded string + * @param input The input string possibly containing character ranges + * @return The string with ranges expanded */ - private String expandRangesAndEscapes(String input) { + private String expandRanges(String input) { StringBuilder expanded = new StringBuilder(); + for (int i = 0; i < input.length(); i++) { - char ch = input.charAt(i); - if (i + 2 < input.length() && input.charAt(i + 1) == '-') { - char start = ch; - char end = input.charAt(i + 2); - i += 2; - if (start <= end) { - for (char c = start; c <= end; c++) { - expanded.append(c); + if (i > 0 && i < input.length() - 1 && input.charAt(i) == '-') { + char start = input.charAt(i - 1); + char end = input.charAt(i + 1); + + // Check if this is a valid range + if (start < end) { + // We already added start, so begin from start + 1 + for (char ch = (char)(start + 1); ch <= end; ch++) { + expanded.append(ch); } - } else { - for (char c = start; c >= end; c--) { - expanded.append(c); - } - } - } else if (ch == '\\' && i + 1 < input.length()) { - char next = input.charAt(i + 1); - switch (next) { - case 'n': - expanded.append('\n'); - break; - case 't': - expanded.append('\t'); - break; - case 'r': - expanded.append('\r'); - break; - case 'f': - expanded.append('\f'); - break; - case 'x': - if (i + 3 < input.length() && isHexDigit(input.charAt(i + 2)) && isHexDigit(input.charAt(i + 3))) { - int hexValue = Integer.parseInt(input.substring(i + 2, i + 4), 16); - expanded.append((char) hexValue); - i += 3; - } - break; - default: - expanded.append(next); - break; + i++; // Skip the end character as we've already processed it + continue; } - i++; - } else { - expanded.append(ch); } + + expanded.append(input.charAt(i)); } - return expanded.toString(); - } - /** - * Checks if a character is a hexadecimal digit. - * - * @param ch The character to check - * @return True if the character is a hexadecimal digit, false otherwise - */ - private boolean isHexDigit(char ch) { - return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f'); + return expanded.toString(); } /** @@ -189,7 +156,10 @@ private boolean isHexDigit(char ch) { private void complementTranslationMap(char[] translationMap, boolean[] usedChars, String search, String replace) { boolean[] complementSet = new boolean[256]; for (int i = 0; i < search.length(); i++) { - complementSet[search.charAt(i)] = true; + char ch = search.charAt(i); + if (ch < 256) { + complementSet[ch] = true; + } } int replaceIndex = 0; @@ -202,6 +172,7 @@ private void complementTranslationMap(char[] translationMap, boolean[] usedChars } else { if (deleteUnmatched) { deleteChars[i] = true; + usedChars[i] = true; } else if (!replace.isEmpty()) { translationMap[i] = replace.charAt(replace.length() - 1); usedChars[i] = true; @@ -221,22 +192,46 @@ private void complementTranslationMap(char[] translationMap, boolean[] usedChars */ private void populateTranslationMap(char[] translationMap, boolean[] usedChars, String search, String replace) { int minLength = Math.min(search.length(), replace.length()); + + // First pass: map characters that have replacements for (int i = 0; i < minLength; i++) { - translationMap[search.charAt(i)] = replace.charAt(i); - usedChars[search.charAt(i)] = true; + char searchChar = search.charAt(i); + if (searchChar < 256) { + // Only map if not already mapped (first occurrence wins) + if (!usedChars[searchChar]) { + translationMap[searchChar] = replace.charAt(i); + usedChars[searchChar] = true; + } + } } + // Second pass: handle remaining characters in search string for (int i = minLength; i < search.length(); i++) { - if (deleteUnmatched) { - deleteChars[search.charAt(i)] = true; - } else if (!replace.isEmpty()) { - translationMap[search.charAt(i)] = replace.charAt(replace.length() - 1); - usedChars[search.charAt(i)] = true; - } else { - // Empty replacement - map to self - translationMap[search.charAt(i)] = search.charAt(i); - usedChars[search.charAt(i)] = true; + char searchChar = search.charAt(i); + if (searchChar < 256 && !usedChars[searchChar]) { + if (deleteUnmatched) { + deleteChars[searchChar] = true; + usedChars[searchChar] = true; + } else if (!replace.isEmpty()) { + // Map to the last character in replace string + translationMap[searchChar] = replace.charAt(replace.length() - 1); + usedChars[searchChar] = true; + } else { + // Empty replacement - map to self + translationMap[searchChar] = searchChar; + usedChars[searchChar] = true; + } } } } + + /** + * Checks if a character is a hexadecimal digit. + * + * @param ch The character to check + * @return True if the character is a hexadecimal digit, false otherwise + */ + private boolean isHexDigit(char ch) { + return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f'); + } } diff --git a/src/main/java/org/perlonjava/parser/StringParser.java b/src/main/java/org/perlonjava/parser/StringParser.java index e61343ab2..43509b771 100644 --- a/src/main/java/org/perlonjava/parser/StringParser.java +++ b/src/main/java/org/perlonjava/parser/StringParser.java @@ -275,8 +275,7 @@ public static OperatorNode parseTransliteration(EmitterContext ctx, ParsedString rawStr.endDelim, ' ', ' ' ); - // searchNode = StringDoubleQuoted.parseDoubleQuotedString(ctx, searchParsed, true, false); - searchNode = StringDoubleQuoted.parseDoubleQuotedString(ctx, searchParsed, false, false); + searchNode = StringDoubleQuoted.parseDoubleQuotedString(ctx, searchParsed, true, false); } // Same logic for replacement list @@ -292,8 +291,7 @@ public static OperatorNode parseTransliteration(EmitterContext ctx, ParsedString rawStr.secondBufferEndDelim, ' ', ' ' ); - // replacementNode = StringDoubleQuoted.parseDoubleQuotedString(ctx, replaceParsed, true, false); - replacementNode = StringDoubleQuoted.parseDoubleQuotedString(ctx, replaceParsed, false, false); + replacementNode = StringDoubleQuoted.parseDoubleQuotedString(ctx, replaceParsed, true, false); } Node modifierNode = new StringNode(modifiers, rawStr.index); From 719a1710b2f8797ab686ac188b0f96268154756d Mon Sep 17 00:00:00 2001 From: "Flavio S. Glock" Date: Wed, 30 Jul 2025 15:28:36 +0200 Subject: [PATCH 2/3] fix tr/// WIP --- .../operators/RuntimeTransliterate.java | 61 +++++++++++++++++-- 1 file changed, 55 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/perlonjava/operators/RuntimeTransliterate.java b/src/main/java/org/perlonjava/operators/RuntimeTransliterate.java index 05cae4b9a..70606b26f 100644 --- a/src/main/java/org/perlonjava/operators/RuntimeTransliterate.java +++ b/src/main/java/org/perlonjava/operators/RuntimeTransliterate.java @@ -14,6 +14,7 @@ public class RuntimeTransliterate { private char[] translationMap; private boolean[] usedChars; private boolean[] deleteChars; + private boolean[] inSearchSet; // Track which chars are in the original search set private boolean complement; private boolean deleteUnmatched; private boolean squashDuplicates; @@ -44,25 +45,44 @@ public static RuntimeTransliterate compile(RuntimeScalar search, RuntimeScalar r public RuntimeScalar transliterate(RuntimeScalar originalString, int ctx) { String input = originalString.toString(); StringBuilder result = new StringBuilder(); - boolean lastCharAdded = false; + char lastChar = '\0'; + boolean lastCharWasFromComplement = false; int count = 0; // Track count of transliterated characters for (int i = 0; i < input.length(); i++) { char ch = input.charAt(i); if (ch < 256 && deleteChars[ch]) { - lastCharAdded = false; count++; // Count deleted characters + lastChar = '\0'; + lastCharWasFromComplement = false; } else if (ch < 256 && usedChars[ch]) { char mappedChar = translationMap[ch]; - if (!squashDuplicates || result.length() == 0 || result.charAt(result.length() - 1) != mappedChar) { + boolean isFromComplement = complement && !inSearchSet[ch]; + + // Apply squashing logic + boolean shouldSquash = false; + if (squashDuplicates && result.length() > 0 && lastChar == mappedChar) { + // In complement mode, only squash if both current and last char are from complement + if (complement) { + shouldSquash = isFromComplement && lastCharWasFromComplement; + } else { + // In normal mode, squash all duplicates + shouldSquash = true; + } + } + + if (!shouldSquash) { result.append(mappedChar); - lastCharAdded = true; } + // Always count characters that match the search pattern count++; + lastChar = mappedChar; + lastCharWasFromComplement = isFromComplement; } else { result.append(ch); - lastCharAdded = false; + lastChar = ch; + lastCharWasFromComplement = false; } } @@ -100,11 +120,21 @@ public void compileTransliteration(String search, String replace, String modifie translationMap = new char[256]; usedChars = new boolean[256]; deleteChars = new boolean[256]; + inSearchSet = new boolean[256]; for (int i = 0; i < 256; i++) { translationMap[i] = (char) i; usedChars[i] = false; deleteChars[i] = false; + inSearchSet[i] = false; + } + + // Mark characters in the search set + for (int i = 0; i < expandedSearch.length(); i++) { + char ch = expandedSearch.charAt(i); + if (ch < 256) { + inSearchSet[ch] = true; + } } if (complement) { @@ -162,6 +192,25 @@ private void complementTranslationMap(char[] translationMap, boolean[] usedChars } } + // Special case: complement with empty replacement + if (replace.isEmpty()) { + // For each character NOT in the search set (i.e., in the complement) + for (int i = 0; i < 256; i++) { + if (!complementSet[i]) { + if (deleteUnmatched) { + // With 'd' modifier, delete complement characters + deleteChars[i] = true; + usedChars[i] = true; + } else { + // Without 'd' modifier, map to themselves (for squashing) + usedChars[i] = true; + translationMap[i] = (char) i; + } + } + } + return; + } + int replaceIndex = 0; for (int i = 0; i < 256; i++) { if (!complementSet[i]) { @@ -173,7 +222,7 @@ private void complementTranslationMap(char[] translationMap, boolean[] usedChars if (deleteUnmatched) { deleteChars[i] = true; usedChars[i] = true; - } else if (!replace.isEmpty()) { + } else { translationMap[i] = replace.charAt(replace.length() - 1); usedChars[i] = true; } From 205c6512a4fdb8f08a832dd98baa45e9771319a6 Mon Sep 17 00:00:00 2001 From: "Flavio S. Glock" Date: Wed, 30 Jul 2025 16:09:28 +0200 Subject: [PATCH 3/3] fix tr/// WIP --- .../operators/RuntimeTransliterate.java | 164 ++++++++---------- 1 file changed, 75 insertions(+), 89 deletions(-) diff --git a/src/main/java/org/perlonjava/operators/RuntimeTransliterate.java b/src/main/java/org/perlonjava/operators/RuntimeTransliterate.java index 70606b26f..ac00442b6 100644 --- a/src/main/java/org/perlonjava/operators/RuntimeTransliterate.java +++ b/src/main/java/org/perlonjava/operators/RuntimeTransliterate.java @@ -1,7 +1,11 @@ package org.perlonjava.operators; -import org.perlonjava.runtime.RuntimeContextType; +import org.perlonjava.runtime.PerlCompilerException; import org.perlonjava.runtime.RuntimeScalar; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; /** * The RuntimeTransliterate class implements Perl's tr/// operator, which is used for character @@ -10,11 +14,11 @@ */ public class RuntimeTransliterate { - // Arrays and flags used for transliteration - private char[] translationMap; - private boolean[] usedChars; - private boolean[] deleteChars; - private boolean[] inSearchSet; // Track which chars are in the original search set + // Maps and sets used for transliteration (now supports full Unicode) + private Map translationMap; + private Set usedChars; + private Set deleteChars; + private Set inSearchSet; // Track which chars are in the original search set private boolean complement; private boolean deleteUnmatched; private boolean squashDuplicates; @@ -39,7 +43,7 @@ public static RuntimeTransliterate compile(RuntimeScalar search, RuntimeScalar r * Applies the transliteration pattern to the given string. * * @param originalString The original string to be transliterated - * @param ctx The runtime context + * @param ctx The runtime context * @return A new RuntimeScalar containing the transliterated string or count */ public RuntimeScalar transliterate(RuntimeScalar originalString, int ctx) { @@ -51,13 +55,14 @@ public RuntimeScalar transliterate(RuntimeScalar originalString, int ctx) { for (int i = 0; i < input.length(); i++) { char ch = input.charAt(i); - if (ch < 256 && deleteChars[ch]) { + + if (deleteChars.contains(ch)) { count++; // Count deleted characters lastChar = '\0'; lastCharWasFromComplement = false; - } else if (ch < 256 && usedChars[ch]) { - char mappedChar = translationMap[ch]; - boolean isFromComplement = complement && !inSearchSet[ch]; + } else if (usedChars.contains(ch)) { + char mappedChar = translationMap.getOrDefault(ch, ch); + boolean isFromComplement = complement && !inSearchSet.contains(ch); // Apply squashing logic boolean shouldSquash = false; @@ -117,30 +122,21 @@ public void compileTransliteration(String search, String replace, String modifie String expandedSearch = expandRanges(search); String expandedReplace = expandRanges(replace); - translationMap = new char[256]; - usedChars = new boolean[256]; - deleteChars = new boolean[256]; - inSearchSet = new boolean[256]; - - for (int i = 0; i < 256; i++) { - translationMap[i] = (char) i; - usedChars[i] = false; - deleteChars[i] = false; - inSearchSet[i] = false; - } + translationMap = new HashMap<>(); + usedChars = new HashSet<>(); + deleteChars = new HashSet<>(); + inSearchSet = new HashSet<>(); // Mark characters in the search set for (int i = 0; i < expandedSearch.length(); i++) { char ch = expandedSearch.charAt(i); - if (ch < 256) { - inSearchSet[ch] = true; - } + inSearchSet.add(ch); } if (complement) { - complementTranslationMap(translationMap, usedChars, expandedSearch, expandedReplace); + complementTranslationMap(expandedSearch, expandedReplace); } else { - populateTranslationMap(translationMap, usedChars, expandedSearch, expandedReplace); + populateTranslationMap(expandedSearch, expandedReplace); } } @@ -161,12 +157,20 @@ private String expandRanges(String input) { // Check if this is a valid range if (start < end) { // We already added start, so begin from start + 1 - for (char ch = (char)(start + 1); ch <= end; ch++) { + for (char ch = (char) (start + 1); ch <= end; ch++) { expanded.append(ch); } i++; // Skip the end character as we've already processed it continue; + } else if (start > end) { + // Invalid range - throw exception + String startHex = String.format("\\x{%04X}", (int) start); + String endHex = String.format("\\x{%04X}", (int) end); + throw new PerlCompilerException( + "Invalid range \"" + startHex + "-" + endHex + "\" in transliteration operator" + ); } + // If start == end, fall through and treat as literal characters } expanded.append(input.charAt(i)); @@ -178,53 +182,49 @@ private String expandRanges(String input) { /** * Complements the translation map based on the search and replace strings. * - * @param translationMap The translation map to populate - * @param usedChars The array indicating which characters are used * @param search The search string * @param replace The replacement string */ - private void complementTranslationMap(char[] translationMap, boolean[] usedChars, String search, String replace) { - boolean[] complementSet = new boolean[256]; + private void complementTranslationMap(String search, String replace) { + Set searchSet = new HashSet<>(); for (int i = 0; i < search.length(); i++) { - char ch = search.charAt(i); - if (ch < 256) { - complementSet[ch] = true; - } + searchSet.add(search.charAt(i)); } - // Special case: complement with empty replacement - if (replace.isEmpty()) { - // For each character NOT in the search set (i.e., in the complement) - for (int i = 0; i < 256; i++) { - if (!complementSet[i]) { + // We need to iterate through all characters that might appear in the input + // For now, we'll handle the common case of characters up to U+FFFF + int replaceIndex = 0; + + for (int codePoint = 0; codePoint <= 0xFFFF; codePoint++) { + char ch = (char) codePoint; + + if (!searchSet.contains(ch)) { + // This character is in the complement set + if (replace.isEmpty()) { + // Special case: complement with empty replacement if (deleteUnmatched) { // With 'd' modifier, delete complement characters - deleteChars[i] = true; - usedChars[i] = true; + deleteChars.add(ch); + usedChars.add(ch); } else { // Without 'd' modifier, map to themselves (for squashing) - usedChars[i] = true; - translationMap[i] = (char) i; + usedChars.add(ch); + translationMap.put(ch, ch); } - } - } - return; - } - - int replaceIndex = 0; - for (int i = 0; i < 256; i++) { - if (!complementSet[i]) { - if (replaceIndex < replace.length()) { - translationMap[i] = replace.charAt(replaceIndex); - usedChars[i] = true; - replaceIndex++; } else { - if (deleteUnmatched) { - deleteChars[i] = true; - usedChars[i] = true; + // Map to replacement characters + if (replaceIndex < replace.length()) { + translationMap.put(ch, replace.charAt(replaceIndex)); + usedChars.add(ch); + replaceIndex++; } else { - translationMap[i] = replace.charAt(replace.length() - 1); - usedChars[i] = true; + if (deleteUnmatched) { + deleteChars.add(ch); + usedChars.add(ch); + } else { + translationMap.put(ch, replace.charAt(replace.length() - 1)); + usedChars.add(ch); + } } } } @@ -234,53 +234,39 @@ private void complementTranslationMap(char[] translationMap, boolean[] usedChars /** * Populates the translation map based on the search and replace strings. * - * @param translationMap The translation map to populate - * @param usedChars The array indicating which characters are used * @param search The search string * @param replace The replacement string */ - private void populateTranslationMap(char[] translationMap, boolean[] usedChars, String search, String replace) { + private void populateTranslationMap(String search, String replace) { int minLength = Math.min(search.length(), replace.length()); // First pass: map characters that have replacements for (int i = 0; i < minLength; i++) { char searchChar = search.charAt(i); - if (searchChar < 256) { - // Only map if not already mapped (first occurrence wins) - if (!usedChars[searchChar]) { - translationMap[searchChar] = replace.charAt(i); - usedChars[searchChar] = true; - } + // Only map if not already mapped (first occurrence wins) + if (!usedChars.contains(searchChar)) { + translationMap.put(searchChar, replace.charAt(i)); + usedChars.add(searchChar); } } // Second pass: handle remaining characters in search string for (int i = minLength; i < search.length(); i++) { char searchChar = search.charAt(i); - if (searchChar < 256 && !usedChars[searchChar]) { + if (!usedChars.contains(searchChar)) { if (deleteUnmatched) { - deleteChars[searchChar] = true; - usedChars[searchChar] = true; + deleteChars.add(searchChar); + usedChars.add(searchChar); } else if (!replace.isEmpty()) { // Map to the last character in replace string - translationMap[searchChar] = replace.charAt(replace.length() - 1); - usedChars[searchChar] = true; + translationMap.put(searchChar, replace.charAt(replace.length() - 1)); + usedChars.add(searchChar); } else { // Empty replacement - map to self - translationMap[searchChar] = searchChar; - usedChars[searchChar] = true; + translationMap.put(searchChar, searchChar); + usedChars.add(searchChar); } } } } - - /** - * Checks if a character is a hexadecimal digit. - * - * @param ch The character to check - * @return True if the character is a hexadecimal digit, false otherwise - */ - private boolean isHexDigit(char ch) { - return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f'); - } -} +} \ No newline at end of file