Skip to content

Commit

Permalink
8305681: Allow additional characters for GB18030-2022 (Level 2) support
Browse files Browse the repository at this point in the history
Co-authored-by: Justin Lu <[email protected]>
Reviewed-by: lancea, iris, rriggs
Backport-of: b08cddec8625424b1292051088513a60606ef1e9
  • Loading branch information
naotoj and Justin Lu committed May 10, 2023
1 parent 9740c24 commit 3d9e8d0
Show file tree
Hide file tree
Showing 6 changed files with 55 additions and 18 deletions.
10 changes: 10 additions & 0 deletions jdk/make/data/characterdata/CharacterData02.java.template
Original file line number Diff line number Diff line change
Expand Up @@ -103,11 +103,21 @@ class CharacterData02 extends CharacterData {
}

/**
 * Reports whether the given code point may start a Java identifier.
 *
 * The answer strictly conforms to the code points assigned in
 * Unicode 6.2, so every code point in the CJK Unified Ideographs
 * Extension E block is rejected before its properties are consulted.
 *
 * @param ch the code point to test
 * @return {@code true} if the code point's identifier properties reach
 *         the Java-start threshold and it lies outside Extension E;
 *         {@code false} otherwise
 */
boolean isJavaIdentifierStart(int ch) {
    final boolean inExtensionE = Character.UnicodeBlock.of(ch)
            == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E;
    // Short-circuit keeps getProperties() from being called for Extension E.
    return !inExtensionE
            && (getProperties(ch) & $$maskIdentifierInfo) >= $$lowJavaStart;
}

/**
 * Reports whether the given code point may appear in a Java identifier
 * after the first character.
 *
 * The answer strictly conforms to the code points assigned in
 * Unicode 6.2, so every code point in the CJK Unified Ideographs
 * Extension E block is rejected before its properties are consulted.
 *
 * @param ch the code point to test
 * @return {@code true} if any Java-part bit is set in the code point's
 *         properties and it lies outside Extension E; {@code false}
 *         otherwise
 */
boolean isJavaIdentifierPart(int ch) {
    final boolean inExtensionE = Character.UnicodeBlock.of(ch)
            == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E;
    // Short-circuit keeps getProperties() from being called for Extension E.
    return !inExtensionE
            && (getProperties(ch) & $$nonzeroJavaPart) != 0;
}
Expand Down
2 changes: 2 additions & 0 deletions jdk/make/data/unicodedata/UnicodeData.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23550,6 +23550,8 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;;
2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;;
2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;;
2B820;<CJK Ideograph Extension E, First>;Lo;0;L;;;;;N;;;;;
2CEA1;<CJK Ideograph Extension E, Last>;Lo;0;L;;;;;N;;;;;
2F800;CJK COMPATIBILITY IDEOGRAPH-2F800;Lo;0;L;4E3D;;;;N;;;;;
2F801;CJK COMPATIBILITY IDEOGRAPH-2F801;Lo;0;L;4E38;;;;N;;;;;
2F802;CJK COMPATIBILITY IDEOGRAPH-2F802;Lo;0;L;4E41;;;;N;;;;;
Expand Down
23 changes: 19 additions & 4 deletions jdk/src/share/classes/java/lang/Character.java
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,10 @@
* block from version 10.0 of the Unicode Standard. Second, the Java SE 8 Platform
* allows an implementation of class {@code Character} to use the code points
* in the range of {@code U+9FCD} to {@code U+9FEF} from version 11.0 of the
* Unicode Standard, in order for the class to allow the "Implementation
* Level 1" of the Chinese GB18030-2022 standard. Third, the Java SE 8 Platform
* Unicode Standard and in the {@code CJK Unified Ideographs Extension E} block
* from version 8.0 of the Unicode Standard, in order for the class to allow the
* "Implementation Level 2" of the Chinese GB18030-2022 standard.
* Third, the Java SE 8 Platform
* allows an implementation of class {@code Character} to use the Japanese Era
* code point, {@code U+32FF}, from the Unicode Standard version 12.1.
* Consequently, the
Expand Down Expand Up @@ -2575,7 +2577,18 @@ private UnicodeBlock(String idName, String... aliases) {
"ARABIC MATHEMATICAL ALPHABETIC SYMBOLS",
"ARABICMATHEMATICALALPHABETICSYMBOLS");

private static final int[] blockStarts = {
/**
* Constant for the "CJK Unified Ideographs Extension E" Unicode
* character block.
* @apiNote This field is defined in Java SE 8 Maintenance Release 5.
* @since 1.8
*/
public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E =
new UnicodeBlock("CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E",
"CJK UNIFIED IDEOGRAPHS EXTENSION E",
"CJKUNIFIEDIDEOGRAPHSEXTENSIONE");

private static final int blockStarts[] = {
0x0000, // 0000..007F; Basic Latin
0x0080, // 0080..00FF; Latin-1 Supplement
0x0100, // 0100..017F; Latin Extended-A
Expand Down Expand Up @@ -2823,7 +2836,8 @@ private UnicodeBlock(String idName, String... aliases) {
0x2A6E0, // unassigned
0x2A700, // 2A700..2B73F; CJK Unified Ideographs Extension C
0x2B740, // 2B740..2B81F; CJK Unified Ideographs Extension D
0x2B820, // unassigned
0x2B820, // 2B820..2CEAF; CJK Unified Ideographs Extension E
0x2CEB0, // unassigned
0x2F800, // 2F800..2FA1F; CJK Compatibility Ideographs Supplement
0x2FA20, // unassigned
0xE0000, // E0000..E007F; Tags
Expand Down Expand Up @@ -3082,6 +3096,7 @@ private UnicodeBlock(String idName, String... aliases) {
null,
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C,
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D,
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E,
null,
CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT,
null,
Expand Down
4 changes: 2 additions & 2 deletions jdk/test/java/lang/Character/CheckScript.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@

/*
* Copyright (c) 2010, 2012, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2010, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand All @@ -24,7 +24,7 @@

/**
* @test
* @bug 6945564 6959267 7033561 7070436 7198195
* @bug 6945564 6959267 7033561 7070436 7198195 8305681
* @summary Check that the j.l.Character.UnicodeScript
*/

Expand Down
3 changes: 2 additions & 1 deletion jdk/test/java/lang/Character/Scripts.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1439,9 +1439,10 @@ FA70..FAD9 ; Han # Lo [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILI
20000..2A6D6 ; Han # Lo [42711] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6D6
2A700..2B734 ; Han # Lo [4149] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B734
2B740..2B81D ; Han # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
2B820..2CEA1 ; Han # Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
2F800..2FA1D ; Han # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D

# Total code points: 75998
# Total code points: 81760

# ================================================

Expand Down
31 changes: 20 additions & 11 deletions jdk/test/java/lang/Character/TestIsJavaIdentifierMethods.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,19 +25,26 @@
* @test
* @summary Test behavior of isJavaIdentifierXX, isJavaLetter, and
* isJavaLetterOrDigit methods for all code points.
* @bug 8218915
* @bug 8218915 8301400 8305681
*/

public class TestIsJavaIdentifierMethods {
// Unassigned code points not present in Unicode 6.2 (which Java SE 8
// is based upon), including: various currency symbol sign code points
// (Nordic Mark ... Bitcoin), Japanese Era Square character code point,
// and 35 CJK Unified Ideograph code points from GB18030-2022
// (Nordic Mark ... Bitcoin), the Japanese Era Square character code point, and
// code points for GB18030-2022 level 1 and 2 implementation, including
// 35 code points from CJK Unified Ideographs and all of CJK Unified Ideographs
// Extension E.
private static final int CS_SIGNS_CODEPOINT_START = 0x20BB;
private static final int CS_SIGNS_CODEPOINT_END = 0x20BF;
private static final int JAPANESE_ERA_CODEPOINT = 0x32FF;
private static final int GB18030_2022_CODEPOINT_START = 0x9FCD;
private static final int GB18030_2022_CODEPOINT_END = 0x9FEF;
// GB18030_2022 Code Points
private static final int CJK_GB18030_LEVEL1_START = 0x9FCD;
private static final int CJK_GB18030_LEVEL1_END = 0x9FEF;
// Extension E code points are greater than U+FFFF,
// and thus only the int methods need to be tested
private static final int CJK_EXTENSION_E_START = 0x2B820;
private static final int CJK_EXTENSION_E_END = 0x2CEAF;

public static void main(String[] args) {
testIsJavaIdentifierPart_int();
Expand Down Expand Up @@ -77,7 +84,8 @@ public static void testIsJavaIdentifierPart_int() {
// value of variable "expected" is considered false.
if (cp != JAPANESE_ERA_CODEPOINT &&
!(cp >= CS_SIGNS_CODEPOINT_START && cp <= CS_SIGNS_CODEPOINT_END) &&
!(cp >= GB18030_2022_CODEPOINT_START && cp <= GB18030_2022_CODEPOINT_END)) {
!(cp >= CJK_GB18030_LEVEL1_START && cp <= CJK_GB18030_LEVEL1_END) &&
!(cp >= CJK_EXTENSION_E_START && cp <= CJK_EXTENSION_E_END)) {
byte type = (byte) Character.getType(cp);
expected = Character.isLetter(cp)
|| type == Character.CURRENCY_SYMBOL
Expand Down Expand Up @@ -125,7 +133,7 @@ public static void testIsJavaIdentifierPart_char() {
// value of variable "expected" is considered false.
if (i != JAPANESE_ERA_CODEPOINT &&
!(i >= CS_SIGNS_CODEPOINT_START && i <= CS_SIGNS_CODEPOINT_END) &&
!(i >= GB18030_2022_CODEPOINT_START && i <= GB18030_2022_CODEPOINT_END)) {
!(i >= CJK_GB18030_LEVEL1_START && i <= CJK_GB18030_LEVEL1_END)) {
byte type = (byte) Character.getType(ch);
expected = Character.isLetter(ch)
|| type == Character.CURRENCY_SYMBOL
Expand Down Expand Up @@ -168,7 +176,8 @@ public static void testIsJavaIdentifierStart_int() {
// value of variable "expected" is considered false.
if (cp != JAPANESE_ERA_CODEPOINT &&
!(cp >= CS_SIGNS_CODEPOINT_START && cp <= CS_SIGNS_CODEPOINT_END) &&
!(cp >= GB18030_2022_CODEPOINT_START && cp <= GB18030_2022_CODEPOINT_END)) {
!(cp >= CJK_GB18030_LEVEL1_START && cp <= CJK_GB18030_LEVEL1_END) &&
!(cp >= CJK_EXTENSION_E_START && cp <= CJK_EXTENSION_E_END)) {
byte type = (byte) Character.getType(cp);
expected = Character.isLetter(cp)
|| type == Character.LETTER_NUMBER
Expand Down Expand Up @@ -208,7 +217,7 @@ public static void testIsJavaIdentifierStart_char() {
// value of variable "expected" is considered false.
if (i != JAPANESE_ERA_CODEPOINT &&
!(i >= CS_SIGNS_CODEPOINT_START && i <= CS_SIGNS_CODEPOINT_END) &&
!(i >= GB18030_2022_CODEPOINT_START && i <= GB18030_2022_CODEPOINT_END)) {
!(i >= CJK_GB18030_LEVEL1_START && i <= CJK_GB18030_LEVEL1_END)) {
byte type = (byte) Character.getType(ch);
expected = Character.isLetter(ch)
|| type == Character.LETTER_NUMBER
Expand Down Expand Up @@ -248,7 +257,7 @@ public static void testIsJavaLetter() {
// value of variable "expected" is considered false.
if (i != JAPANESE_ERA_CODEPOINT &&
!(i >= CS_SIGNS_CODEPOINT_START && i <= CS_SIGNS_CODEPOINT_END) &&
!(i >= GB18030_2022_CODEPOINT_START && i <= GB18030_2022_CODEPOINT_END)) {
!(i >= CJK_GB18030_LEVEL1_START && i <= CJK_GB18030_LEVEL1_END)) {
byte type = (byte) Character.getType(ch);
expected = Character.isLetter(ch)
|| type == Character.LETTER_NUMBER
Expand Down Expand Up @@ -292,7 +301,7 @@ public static void testIsJavaLetterOrDigit() {
// value of variable "expected" is considered false.
if (i != JAPANESE_ERA_CODEPOINT &&
!(i >= CS_SIGNS_CODEPOINT_START && i <= CS_SIGNS_CODEPOINT_END) &&
!(i >= GB18030_2022_CODEPOINT_START && i <= GB18030_2022_CODEPOINT_END)) {
!(i >= CJK_GB18030_LEVEL1_START && i <= CJK_GB18030_LEVEL1_END)) {
byte type = (byte) Character.getType(ch);
expected = Character.isLetter(ch)
|| type == Character.CURRENCY_SYMBOL
Expand Down

0 comments on commit 3d9e8d0

Please sign in to comment.