Project

General

Profile

CollationTest.java

Eric Faulhaber, 07/01/2021 02:57 AM

Download (12.1 KB)

 
1
/**
2
 * 
3
 */
4
package sandbox;
5

    
6
import java.io.*;
7
import java.nio.charset.*;
8
import java.sql.*;
9
import java.text.*;
10
import java.util.*;
11

    
12
/**
13
 * A program which generates the first 255 characters associated with an input character set, converts them
14
 * to the JVM's internal character set (i.e., UTF-16), and inserts them into a UTF8 database. From there,
15
 * they are queried and output in the database's collation order.
16
 * <p>
17
 * Supported databases are PostgreSQL and H2.
18
 * <p>
19
 * Some prerequisite setup is needed for this program to work with PostgreSQL:
20
 * <ol>
21
 * <li>Use the default (typically {@code main}) cluster. Assumptions:
22
 * <ul>
23
 * <li>listens on default port 5432;
24
 * <li>uses default UTF8 encoding and picks up its locale from a UTF8 OS locale (e.g., {@code en_US.UTF8};
25
 * </ul>
26
 * <li>connect to the cluster as a superuser (e.g., {@code postgres});
27
 * <li>create a {@code fwd} role: {@code create role fwd with login password 'fwd'};
28
 * <li>create a {@code fwd} database: {@code create database fwd}.
29
 * <li>install {@code citext} data type support: {@code create extension citext} (while connected to the
30
 *     {@code fwd} database);
31
 * </ol>
32
 * <p>
33
 * There are no specific prerequisites for H2; a transient, in-memory database is used.
34
 */
35
public class CollationTest
36
{
37
   /** Insert statement */
38
   private static final String insert =
39
      "insert into tt1(codepoint_ori, encoded_cs, encoded_ci) values (?, ?, ?)";
40
   
41
   /** Query for case-insensitive collation */
42
   private static final String qci =
43
      "select codepoint_ori, ascii(encoded_ci), encoded_ci " +
44
      "from tt1 " +
45
      "order by encoded_ci, ascii(encoded_ci) %s";
46
   
47
   /** Query for case-sensitive collation */
48
   private static final String qcs =
49
      "select codepoint_ori, ascii(encoded_cs), encoded_cs " +
50
      "from tt1 " +
51
      "order by encoded_cs, ascii(encoded_cs) %s";
52
   
53
   /** Result set output header text */
54
   private static final String rsHdr =
55
      " codepoint src | codepoint tgt | hex codepoint | character ";
56
   
57
   /** Result set output header separator */
58
   private static final String rsSep =
59
      "---------------+---------------+---------------+-----------";
60
   
61
   /** Result set row output spec */
62
   private static final String rsRow =
63
      "           %3d |          %4d |          %4X | %s         ";
64
   
65
   /** Name of original character set */
66
   private static String originalEncoding = null;
67
   
68
   /** Listening port (PostgreSQL only) */
69
   private static String port = "5432";
70
   
71
   /** JDBC URL */
72
   private static String jdbcURL = null;
73
   
74
   /** Collation */
75
   private static String collation = null;
76
   
77
   /**
78
    * Print program syntax with optional error message.
79
    * <p>
80
    * Expected syntax is:
81
    * <pre>
82
    * CollationTest <charset name> <database type>
83
    * 
84
    * where...
85
    *    <charset name> is the name or alias of an available character set
86
    *    <database type> is 1 for PostgreSQL; 2 for H2
87
    *    <collation> (optional, only valid for H2) collation descriptor
88
    * </pre>
89
    * 
90
    * @param   message
91
    *          Optional error message; may by {@code null}.
92
    */
93
   private static void syntax(String message)
94
   {
95
      if (message != null)
96
      {
97
         System.err.println("ERROR: " + message);
98
         System.err.println();
99
      }
100
      
101
      System.err.println("Syntax: java CollationTest <charset name> { 1 [<port>] | 2 [<collation>] }");
102
      System.err.println();
103
      System.err.println("where...");
104
      System.err.println("   <charset name> is the name or alias of an available character set");
105
      System.err.println("   1 for PostgreSQL; 2 for H2");
106
      System.err.println("   <port> (optional, only valid for PostgreSQL) listening port; defaults to 5432");
107
      System.err.println("   <collation> (optional, only valid for H2) collation descriptor");
108
      System.err.println();
109
      
110
      System.err.println("Available charset names [aliases]:");
111
      System.err.println("----------------------------------");
112
      for (String name : Charset.availableCharsets().keySet())
113
      {
114
         System.err.print(name);
115
         System.err.println(" " + Charset.forName(name).aliases());
116
      }
117
   }
118
   
119
   /**
120
    * Convert a short value to the smallest number of bytes, omitting the high order byte if it is zero.
121
    * 
122
    * @param   i
123
    *          An integer whose value uses no more than two bytes.
124
    * 
125
    * @return  Array of one or two bytes, representing the binary value of the given input value, high order
126
    *          byte first.
127
    */
128
   private static byte[] shortToBytes(int i)
129
   {
130
      byte high = (byte) (i >> 8);
131
      byte low = (byte) i;
132
      
133
      return high > 0 ? new byte[] { high, low } : new byte[] { low };
134
   }
135
   
136
   /**
137
    * Return an array of the first 255 encoded characters of the given character set, where each encoded
138
    * character has been converted to a Unicode string.
139
    * 
140
    * @param   charset
141
    *          A valid charset name or alias.
142
    * 
143
    * @return  First 255 characters of the given character set, as Unicode strings.
144
    */
145
   private static String[] first255Encoded(Charset charset)
146
   {
147
      int len = 255;
148
      String[] encoded = new String[len];
149
      
150
      for (int i = 1; i <= len; i++)
151
      {
152
         byte[] ba = shortToBytes(i);
153
         String s = new String(ba, 0, ba.length, charset);
154
         encoded[i - 1] = s;
155
//         System.out.println("codepoint: " + i + "; encoded string: '" + s + "'");
156
      }
157
      
158
      return encoded;
159
   }
160
   
161
   /**
162
    * Perform the necessary setup for the PostgreSQL database.
163
    * 
164
    * @param   stmt
165
    *          SQL statement.
166
    * @param   newCollation
167
    *          Ignored; {@code lc_collate} setting must be set at cluster initialization and cannot be
168
    *          changed thereafter.
169
    * 
170
    * @throws  SQLException
171
    *          if an error is encountered executing a SQL statement.
172
    */
173
   private static void setupPostgreSQL(Statement stmt, String newCollation)
174
   throws SQLException
175
   {
176
      try (ResultSet rs = stmt.executeQuery("select datcollate from pg_database where datname = 'fwd'"))
177
      {
178
         if (rs.next())
179
         {
180
            collation = rs.getString(1);
181
         }
182
      }
183
      
184
      if (newCollation != null)
185
      {
186
         System.err.println("Collation parameter '" + newCollation + "' cannot be set; parameter ignored");
187
      }
188
      
189
      stmt.execute("create temporary table tt1 (codepoint_ori int, encoded_cs text, encoded_ci citext)");
190
   }
191
   
192
   /**
193
    * Perform the necessary setup for the H2 database.
194
    * 
195
    * @param   stmt
196
    *          SQL statement.
197
    * 
198
    * @throws  SQLException
199
    *          if an error is encountered executing a SQL statement.
200
    */
201
   private static void setupH2(Statement stmt, String newCollation)
202
   throws SQLException
203
   {
204
      if (newCollation != null)
205
      {
206
         stmt.execute("set collation " + newCollation);
207
         collation = newCollation;
208
      }
209
      stmt.execute("create temporary table tt1 " +
210
                   "(codepoint_ori int, encoded_cs varchar, encoded_ci varchar_ignorecase)");
211
   }
212
   
213
   /**
214
    * Generate collated query output to file. The file name is composed of the distinctive elements of the
215
    * query:
216
    * <ul>
217
    * <li>the database type
218
    * <li>the original character encoding
219
    * <li>the case-sensitivity of the collation
220
    * <li>today's date
221
    * <li>the direction applied to the secondary sort by UTF codepoint (1 for asc, 2 for desc)
222
    * </ul>
223
    * 
224
    * @param   stmt
225
    *          SQL statement
226
    * @param   caseSensitive
227
    *          {@code true} for case-sensitive collation, {@code false} for case-insensitive.
228
    * @param   ascending
229
    *          Direction of the secondary sort {@code true} for ascending, {@code false} for descending.
230
    * 
231
    * @throws  SQLException
232
    *          if an error occurs executing the query or fetching the results.
233
    */
234
   private static void generateOutput(Statement stmt, boolean caseSensitive, boolean ascending)
235
   throws SQLException,
236
          IOException
237
   {
238
      int begin = jdbcURL.indexOf(':') + 1;
239
      int end = jdbcURL.indexOf(':', begin);
240
      String db = jdbcURL.substring(begin, end);
241
      SimpleDateFormat format = new SimpleDateFormat("yyyyMMdd");
242
      String date = format.format(Calendar.getInstance().getTime());
243
      String enc = originalEncoding.toLowerCase();
244
      String coll = collation == null ? "no_collation" : collation.replace(' ', '_');
245
      int sort = ascending ? 1 : 2;
246
      
247
      String fileSpec = "%s_%s_to_utf_case-%ssensitive_%s_%s_%d.txt";
248
      String file = String.format(fileSpec, db, enc, caseSensitive ? "" : "in", coll, date, sort);
249
      
250
      try (PrintStream ps = new PrintStream(new FileOutputStream(file)))
251
      {
252
         ps.println("Database         : " + jdbcURL);
253
         ps.println("Case-sensitive   : " + caseSensitive);
254
         ps.println("Original Encoding: " + originalEncoding);
255
         ps.println("Collation        : " + collation);
256
         ps.println();
257
         String query = caseSensitive ? qcs : qci;
258
         ResultSet rs = stmt.executeQuery(String.format(query, ascending ? "asc" : "desc"));
259
         ps.println(rsHdr);
260
         ps.println(rsSep);
261
         while (rs.next())
262
         {
263
            int cpori = rs.getInt(1);
264
            int cputf = rs.getInt(2);
265
            String ch = rs.getString(3);
266
            ps.println(String.format(rsRow, cpori, cputf, cputf, ch));
267
         }
268
      }
269
   }
270
   
271
   /**
272
    * Command line entry point; expects a character set name/alias and a database type to indicate which
273
    * database to use.
274
    * 
275
    * @see #syntax(String)
276
    */
277
   public static void main(String[] args)
278
   {
279
      int numArgs = args.length;
280
      
281
      if (numArgs < 2)
282
      {
283
         syntax("Incorrect argument(s)");
284
         
285
         return;
286
      }
287
      
288
      String coll = collation;
289
      
290
      // required argument: charset name
291
      originalEncoding = args[0];
292
      
293
      SQLFunc setup = null;
294
      
295
      try
296
      {
297
         switch (args[1])
298
         {
299
            case "1":
300
               if (numArgs == 3)
301
               {
302
                  port = args[2];
303
               }
304
               jdbcURL = String.format("jdbc:postgresql://localhost:%s/fwd", port);
305
               setup = CollationTest::setupPostgreSQL;
306
               break;
307
            case "2":
308
               if (numArgs == 3)
309
               {
310
                  coll = args[2];
311
               }
312
               jdbcURL = "jdbc:h2:mem:fwd";
313
               setup = CollationTest::setupH2;
314
               break;
315
            default:
316
               throw new IllegalArgumentException();
317
         }
318
      }
319
      catch (Exception exc)
320
      {
321
         syntax("Invalid database type");
322
         
323
         return;
324
      }
325
      
326
      Charset charset;
327
      try
328
      {
329
         charset = Charset.forName(originalEncoding);
330
         originalEncoding = charset.displayName();
331
      }
332
      catch (IllegalArgumentException exc)
333
      {
334
         syntax("Invalid charset name: " + exc.getMessage());
335
         
336
         return;
337
      }
338
      
339
      String[] encoded = first255Encoded(charset);
340
      int len = encoded.length;
341
      
342
      try (Connection conn = DriverManager.getConnection(jdbcURL, "fwd", "fwd"))
343
      {
344
         Statement stmt = conn.createStatement();
345
         setup.execute(stmt, coll);
346
         
347
         PreparedStatement ps = conn.prepareStatement(insert);
348
         for (int i = 1; i <= len; i++)
349
         {
350
            String s = encoded[i - 1];
351
            ps.setInt(1, i);
352
            ps.setString(2, s);
353
            ps.setString(3, s);
354
            ps.addBatch();
355
         }
356
         
357
         ps.executeBatch();
358
         ps.close();
359
         
360
         // case-sensitive sort
361
         generateOutput(stmt, true, true);
362
         generateOutput(stmt, true, false);
363
         
364
         // case-insensitive sort
365
         generateOutput(stmt, false, true);
366
         generateOutput(stmt, false, false);
367
      }
368
      catch (SQLException | IOException exc)
369
      {
370
         exc.printStackTrace();
371
      }
372
   }
373
   
374
   /**
375
    * Functional interface which accepts a SQL statement and possibly throws {@code SQLException}.
376
    */
377
   private static interface SQLFunc
378
   {
379
      void execute(Statement stmt, String newCollation)
380
      throws SQLException;
381
   }
382
}