CollationTest.java
1 |
/**
|
---|---|
2 |
*
|
3 |
*/
|
4 |
package sandbox; |
5 |
|
6 |
import java.io.*; |
7 |
import java.nio.charset.*; |
8 |
import java.sql.*; |
9 |
import java.text.*; |
10 |
import java.util.*; |
11 |
|
12 |
/**
|
13 |
* A program which generates the first 255 characters associated with an input character set, converts them
|
14 |
* to the JVM's internal character set (i.e., UTF-16), and inserts them into a UTF8 database. From there,
|
15 |
* they are queried and output in the database's collation order.
|
16 |
* <p>
|
17 |
* Supported databases are PostgreSQL and H2.
|
18 |
* <p>
|
19 |
* Some prerequisite setup is needed for this program to work with PostgreSQL:
|
20 |
* <ol>
|
21 |
* <li>Use the default (typically {@code main}) cluster. Assumptions:
|
22 |
* <ul>
|
23 |
* <li>listens on default port 5432;
|
24 |
* <li>uses default UTF8 encoding and picks up its locale from a UTF8 OS locale (e.g., {@code en_US.UTF8};
|
25 |
* </ul>
|
26 |
* <li>connect to the cluster as a superuser (e.g., {@code postgres});
|
27 |
* <li>create a {@code fwd} role: {@code create role fwd with login password 'fwd'};
|
28 |
* <li>create a {@code fwd} database: {@code create database fwd}.
|
29 |
* <li>install {@code citext} data type support: {@code create extension citext} (while connected to the
|
30 |
* {@code fwd} database);
|
31 |
* </ol>
|
32 |
* <p>
|
33 |
* There are no specific prerequisites for H2; a transient, in-memory database is used.
|
34 |
*/
|
35 |
public class CollationTest |
36 |
{ |
37 |
/** Insert statement */
|
38 |
private static final String insert = |
39 |
"insert into tt1(codepoint_ori, encoded_cs, encoded_ci) values (?, ?, ?)";
|
40 |
|
41 |
/** Query for case-insensitive collation */
|
42 |
private static final String qci = |
43 |
"select codepoint_ori, ascii(encoded_ci), encoded_ci " +
|
44 |
"from tt1 " +
|
45 |
"order by encoded_ci, ascii(encoded_ci) %s";
|
46 |
|
47 |
/** Query for case-sensitive collation */
|
48 |
private static final String qcs = |
49 |
"select codepoint_ori, ascii(encoded_cs), encoded_cs " +
|
50 |
"from tt1 " +
|
51 |
"order by encoded_cs, ascii(encoded_cs) %s";
|
52 |
|
53 |
/** Result set output header text */
|
54 |
private static final String rsHdr = |
55 |
" codepoint src | codepoint tgt | hex codepoint | character ";
|
56 |
|
57 |
/** Result set output header separator */
|
58 |
private static final String rsSep = |
59 |
"---------------+---------------+---------------+-----------";
|
60 |
|
61 |
/** Result set row output spec */
|
62 |
private static final String rsRow = |
63 |
" %3d | %4d | %4X | %s ";
|
64 |
|
65 |
/** Name of original character set */
|
66 |
private static String originalEncoding = null; |
67 |
|
68 |
/** Listening port (PostgreSQL only) */
|
69 |
private static String port = "5432"; |
70 |
|
71 |
/** JDBC URL */
|
72 |
private static String jdbcURL = null; |
73 |
|
74 |
/** Collation */
|
75 |
private static String collation = null; |
76 |
|
77 |
/**
|
78 |
* Print program syntax with optional error message.
|
79 |
* <p>
|
80 |
* Expected syntax is:
|
81 |
* <pre>
|
82 |
* CollationTest <charset name> <database type>
|
83 |
*
|
84 |
* where...
|
85 |
* <charset name> is the name or alias of an available character set
|
86 |
* <database type> is 1 for PostgreSQL; 2 for H2
|
87 |
* <collation> (optional, only valid for H2) collation descriptor
|
88 |
* </pre>
|
89 |
*
|
90 |
* @param message
|
91 |
* Optional error message; may by {@code null}.
|
92 |
*/
|
93 |
private static void syntax(String message) |
94 |
{ |
95 |
if (message != null) |
96 |
{ |
97 |
System.err.println("ERROR: " + message); |
98 |
System.err.println();
|
99 |
} |
100 |
|
101 |
System.err.println("Syntax: java CollationTest <charset name> { 1 [<port>] | 2 [<collation>] }"); |
102 |
System.err.println();
|
103 |
System.err.println("where..."); |
104 |
System.err.println(" <charset name> is the name or alias of an available character set"); |
105 |
System.err.println(" 1 for PostgreSQL; 2 for H2"); |
106 |
System.err.println(" <port> (optional, only valid for PostgreSQL) listening port; defaults to 5432"); |
107 |
System.err.println(" <collation> (optional, only valid for H2) collation descriptor"); |
108 |
System.err.println();
|
109 |
|
110 |
System.err.println("Available charset names [aliases]:"); |
111 |
System.err.println("----------------------------------"); |
112 |
for (String name : Charset.availableCharsets().keySet()) |
113 |
{ |
114 |
System.err.print(name);
|
115 |
System.err.println(" " + Charset.forName(name).aliases()); |
116 |
} |
117 |
} |
118 |
|
119 |
/**
|
120 |
* Convert a short value to the smallest number of bytes, omitting the high order byte if it is zero.
|
121 |
*
|
122 |
* @param i
|
123 |
* An integer whose value uses no more than two bytes.
|
124 |
*
|
125 |
* @return Array of one or two bytes, representing the binary value of the given input value, high order
|
126 |
* byte first.
|
127 |
*/
|
128 |
private static byte[] shortToBytes(int i) |
129 |
{ |
130 |
byte high = (byte) (i >> 8); |
131 |
byte low = (byte) i; |
132 |
|
133 |
return high > 0 ? new byte[] { high, low } : new byte[] { low }; |
134 |
} |
135 |
|
136 |
/**
|
137 |
* Return an array of the first 255 encoded characters of the given character set, where each encoded
|
138 |
* character has been converted to a Unicode string.
|
139 |
*
|
140 |
* @param charset
|
141 |
* A valid charset name or alias.
|
142 |
*
|
143 |
* @return First 255 characters of the given character set, as Unicode strings.
|
144 |
*/
|
145 |
private static String[] first255Encoded(Charset charset) |
146 |
{ |
147 |
int len = 255; |
148 |
String[] encoded = new String[len]; |
149 |
|
150 |
for (int i = 1; i <= len; i++) |
151 |
{ |
152 |
byte[] ba = shortToBytes(i); |
153 |
String s = new String(ba, 0, ba.length, charset); |
154 |
encoded[i - 1] = s;
|
155 |
// System.out.println("codepoint: " + i + "; encoded string: '" + s + "'");
|
156 |
} |
157 |
|
158 |
return encoded;
|
159 |
} |
160 |
|
161 |
/**
|
162 |
* Perform the necessary setup for the PostgreSQL database.
|
163 |
*
|
164 |
* @param stmt
|
165 |
* SQL statement.
|
166 |
* @param newCollation
|
167 |
* Ignored; {@code lc_collate} setting must be set at cluster initialization and cannot be
|
168 |
* changed thereafter.
|
169 |
*
|
170 |
* @throws SQLException
|
171 |
* if an error is encountered executing a SQL statement.
|
172 |
*/
|
173 |
private static void setupPostgreSQL(Statement stmt, String newCollation) |
174 |
throws SQLException |
175 |
{ |
176 |
try (ResultSet rs = stmt.executeQuery("select datcollate from pg_database where datname = 'fwd'")) |
177 |
{ |
178 |
if (rs.next())
|
179 |
{ |
180 |
collation = rs.getString(1);
|
181 |
} |
182 |
} |
183 |
|
184 |
if (newCollation != null) |
185 |
{ |
186 |
System.err.println("Collation parameter '" + newCollation + "' cannot be set; parameter ignored"); |
187 |
} |
188 |
|
189 |
stmt.execute("create temporary table tt1 (codepoint_ori int, encoded_cs text, encoded_ci citext)");
|
190 |
} |
191 |
|
192 |
/**
|
193 |
* Perform the necessary setup for the H2 database.
|
194 |
*
|
195 |
* @param stmt
|
196 |
* SQL statement.
|
197 |
*
|
198 |
* @throws SQLException
|
199 |
* if an error is encountered executing a SQL statement.
|
200 |
*/
|
201 |
private static void setupH2(Statement stmt, String newCollation) |
202 |
throws SQLException |
203 |
{ |
204 |
if (newCollation != null) |
205 |
{ |
206 |
stmt.execute("set collation " + newCollation);
|
207 |
collation = newCollation; |
208 |
} |
209 |
stmt.execute("create temporary table tt1 " +
|
210 |
"(codepoint_ori int, encoded_cs varchar, encoded_ci varchar_ignorecase)");
|
211 |
} |
212 |
|
213 |
/**
|
214 |
* Generate collated query output to file. The file name is composed of the distinctive elements of the
|
215 |
* query:
|
216 |
* <ul>
|
217 |
* <li>the database type
|
218 |
* <li>the original character encoding
|
219 |
* <li>the case-sensitivity of the collation
|
220 |
* <li>today's date
|
221 |
* <li>the direction applied to the secondary sort by UTF codepoint (1 for asc, 2 for desc)
|
222 |
* </ul>
|
223 |
*
|
224 |
* @param stmt
|
225 |
* SQL statement
|
226 |
* @param caseSensitive
|
227 |
* {@code true} for case-sensitive collation, {@code false} for case-insensitive.
|
228 |
* @param ascending
|
229 |
* Direction of the secondary sort {@code true} for ascending, {@code false} for descending.
|
230 |
*
|
231 |
* @throws SQLException
|
232 |
* if an error occurs executing the query or fetching the results.
|
233 |
*/
|
234 |
private static void generateOutput(Statement stmt, boolean caseSensitive, boolean ascending) |
235 |
throws SQLException, |
236 |
IOException
|
237 |
{ |
238 |
int begin = jdbcURL.indexOf(':') + 1; |
239 |
int end = jdbcURL.indexOf(':', begin); |
240 |
String db = jdbcURL.substring(begin, end);
|
241 |
SimpleDateFormat format = new SimpleDateFormat("yyyyMMdd"); |
242 |
String date = format.format(Calendar.getInstance().getTime()); |
243 |
String enc = originalEncoding.toLowerCase();
|
244 |
String coll = collation == null ? "no_collation" : collation.replace(' ', '_'); |
245 |
int sort = ascending ? 1 : 2; |
246 |
|
247 |
String fileSpec = "%s_%s_to_utf_case-%ssensitive_%s_%s_%d.txt"; |
248 |
String file = String.format(fileSpec, db, enc, caseSensitive ? "" : "in", coll, date, sort); |
249 |
|
250 |
try (PrintStream ps = new PrintStream(new FileOutputStream(file))) |
251 |
{ |
252 |
ps.println("Database : " + jdbcURL);
|
253 |
ps.println("Case-sensitive : " + caseSensitive);
|
254 |
ps.println("Original Encoding: " + originalEncoding);
|
255 |
ps.println("Collation : " + collation);
|
256 |
ps.println(); |
257 |
String query = caseSensitive ? qcs : qci;
|
258 |
ResultSet rs = stmt.executeQuery(String.format(query, ascending ? "asc" : "desc")); |
259 |
ps.println(rsHdr); |
260 |
ps.println(rsSep); |
261 |
while (rs.next())
|
262 |
{ |
263 |
int cpori = rs.getInt(1); |
264 |
int cputf = rs.getInt(2); |
265 |
String ch = rs.getString(3); |
266 |
ps.println(String.format(rsRow, cpori, cputf, cputf, ch));
|
267 |
} |
268 |
} |
269 |
} |
270 |
|
271 |
/**
|
272 |
* Command line entry point; expects a character set name/alias and a database type to indicate which
|
273 |
* database to use.
|
274 |
*
|
275 |
* @see #syntax(String)
|
276 |
*/
|
277 |
public static void main(String[] args) |
278 |
{ |
279 |
int numArgs = args.length;
|
280 |
|
281 |
if (numArgs < 2) |
282 |
{ |
283 |
syntax("Incorrect argument(s)");
|
284 |
|
285 |
return;
|
286 |
} |
287 |
|
288 |
String coll = collation;
|
289 |
|
290 |
// required argument: charset name
|
291 |
originalEncoding = args[0];
|
292 |
|
293 |
SQLFunc setup = null;
|
294 |
|
295 |
try
|
296 |
{ |
297 |
switch (args[1]) |
298 |
{ |
299 |
case "1": |
300 |
if (numArgs == 3) |
301 |
{ |
302 |
port = args[2];
|
303 |
} |
304 |
jdbcURL = String.format("jdbc:postgresql://localhost:%s/fwd", port); |
305 |
setup = CollationTest::setupPostgreSQL; |
306 |
break;
|
307 |
case "2": |
308 |
if (numArgs == 3) |
309 |
{ |
310 |
coll = args[2];
|
311 |
} |
312 |
jdbcURL = "jdbc:h2:mem:fwd";
|
313 |
setup = CollationTest::setupH2; |
314 |
break;
|
315 |
default:
|
316 |
throw new IllegalArgumentException(); |
317 |
} |
318 |
} |
319 |
catch (Exception exc) |
320 |
{ |
321 |
syntax("Invalid database type");
|
322 |
|
323 |
return;
|
324 |
} |
325 |
|
326 |
Charset charset;
|
327 |
try
|
328 |
{ |
329 |
charset = Charset.forName(originalEncoding);
|
330 |
originalEncoding = charset.displayName(); |
331 |
} |
332 |
catch (IllegalArgumentException exc) |
333 |
{ |
334 |
syntax("Invalid charset name: " + exc.getMessage());
|
335 |
|
336 |
return;
|
337 |
} |
338 |
|
339 |
String[] encoded = first255Encoded(charset); |
340 |
int len = encoded.length;
|
341 |
|
342 |
try (Connection conn = DriverManager.getConnection(jdbcURL, "fwd", "fwd")) |
343 |
{ |
344 |
Statement stmt = conn.createStatement();
|
345 |
setup.execute(stmt, coll); |
346 |
|
347 |
PreparedStatement ps = conn.prepareStatement(insert);
|
348 |
for (int i = 1; i <= len; i++) |
349 |
{ |
350 |
String s = encoded[i - 1]; |
351 |
ps.setInt(1, i);
|
352 |
ps.setString(2, s);
|
353 |
ps.setString(3, s);
|
354 |
ps.addBatch(); |
355 |
} |
356 |
|
357 |
ps.executeBatch(); |
358 |
ps.close(); |
359 |
|
360 |
// case-sensitive sort
|
361 |
generateOutput(stmt, true, true); |
362 |
generateOutput(stmt, true, false); |
363 |
|
364 |
// case-insensitive sort
|
365 |
generateOutput(stmt, false, true); |
366 |
generateOutput(stmt, false, false); |
367 |
} |
368 |
catch (SQLException | IOException exc) |
369 |
{ |
370 |
exc.printStackTrace(); |
371 |
} |
372 |
} |
373 |
|
374 |
/**
|
375 |
* Functional interface which accepts a SQL statement and possibly throws {@code SQLException}.
|
376 |
*/
|
377 |
private static interface SQLFunc |
378 |
{ |
379 |
void execute(Statement stmt, String newCollation) |
380 |
throws SQLException; |
381 |
} |
382 |
} |