Project

General

Profile

CustomCRC16Encoder.java

compatible Java ENCODE implementation (NOT integrated into P2J) - Greg Shah, 11/04/2014 03:01 PM

Download (23.2 KB)

 
1

    
2
import java.io.*;
3
import java.util.*;
4

    
5
/**
6
 * Progress 4GL compatible <code>ENCODE</code> algorithm.
7
 */
8
public class CustomCRC16Encoder
9
{
10
   /** Initial CRC value. */
11
   public static final int INITIAL_CRC_VALUE = 0x11;
12
   
13
   /** Number of bytes in the encoded output. */
14
   public static final int OUTPUT_ARRAY_SIZE = 16;
15
   
16
   /** Number of iterations in the core encoding loop. */
17
   public static final int CORE_LOOP_PASSES = 5;
18
   
19
   /** Reversed polynomial used with the CRC-16 algorithm. */
20
   public static final int REVERSED_POLYNOMIAL = 0xA001;
21
   
22
   /**
23
    * Progress 4GL compatible <code>ENCODE</code> algorithm which takes a source byte array
24
    * (of any size) and generates a 16-byte one-way hash using a proprietary approach
25
    * combined with a variant of CRC-16.  The basic approach is to spread out the source data
26
    * in an intermediate accumulator array of 16 unsigned bytes and to repeatedly CRC that data
27
    * and copy the resulting 2-byte CRC into successive elements of this intermediate
28
    * accumulator, while building the next CRC results on the accumulated CRC value and the
29
    * recently modified intermediate accumulator.  This set of CRCs is calculated 8 times
30
    * (since that will generate 16 bytes).  This basic approach is executed 5 times in a row.
31
    * Each byte of the resulting hashed intermediate accumulator array is then translated into
32
    * one of the 52 possible uppercase or lowercase English alphabetic characters. That
33
    * translated result is returned.
34
    * <p>
35
    * The detailed algorithm:
36
    * <p>
37
    * <ol>
38
    *   <li> Initialize storage that can hold at least 2 unsigned bytes to 0x11. This is the
39
    *        accumulator for the calculated CRC.
40
    *   <li> Allocate storage that can hold 16 unsigned bytes.  This will be used as an
41
    *        intermediate output accumulator which will combine the input data and the
42
    *        calculated CRC data in a manner that will result in each of the 16 bytes having a
43
    *        value between 0 and 255 inclusive.
44
    *   <li> Iterate the following steps 5 times:
45
    *      <ol>
46
    *         <li>Walk forward through the array of source data provided and XOR each byte into
47
    *             the intermediate output accumulator (target array), but with the index of the
48
    *             target matching a backwards walk. Since the source array can be of any length,
49
    *             the lower index positions in the target may not ever get source data XOR'd.
50
    *             Source arrays longer than the target array cause wrapping and that means that
51
    *             the higher index positions in the target array will have data from multiple
52
    *             source positions XOR'd. See the {@link #sourceToTargetXor} method for details
53
    *             as well as the significant cryptographic limitations this causes.
54
    *         <li>Iterate 8 times, doing the following:
55
    *            <ol>
56
    *               <li> Call the {@link #crc16} method, passing in both the intermediate output
57
    *                    accumulator as well as the CRC accumulator.  This is the core CRC
58
    *                    algorithm and it will calculate the CRC of the current state of the
59
    *                    intermediate output accumulator while including the initial state of
60
    *                    the CRC accumulator.  The resulting returned CRC result is assigned
61
    *                    back to the CRC accumulator.  The CRC-16 algorithm used is not
62
    *                    cryptographically sound.  It generates a non-uniform distribution of
63
    *                    hashed data and collisions are not rare.
64
    *               <li> The least significant byte (byte 0) of the CRC accumulator will be
65
    *                    directly stored into an index position in the intermediate output
66
    *                    accumulator. This index position starts at 0 and increments by 2 for
67
    *                    every iteration of the nearest containing loop.  Since the intermediate
68
    *                    output accumulator has 16 elements, that is why the containing loop
69
    *                    iterates only 8 times.  This byte must be treated as an unsigned value.
70
    *               <li> The next to least significant byte (byte 1) of the CRC accumulator will
71
    *                    be directly stored into an index position in the intermediate output
72
    *                    accumulator. This index position starts at 1 and increments by 2 for
73
    *                    every iteration of the nearest containing loop. This byte must be
74
    *                    treated as an unsigned value.
75
    *            </ol>
76
    *      </ol>
77
    *   <li> At this point the intermediate output accumulator contains 16 poorly distributed
78
    *        hashed bytes.  Translate each byte of the intermediate output accumulator into
79
    *        a byte from the valid output character set.  Valid output characters include all
80
    *        English alphabetic letters (both uppercase and lowercase), for 52 possible output
81
    *        characters.  See the {@link #translateToAlpha} method for details on this process
82
    *        and for the severe limitations of the resulting non-uniform distribution.
83
    * </ol>
84
    * <p>
85
    * The result of this algorithm is NOT cryptographically sound.  It should not be used for
86
    * secure purposes.  The flaws in this algorithm come from the processing associated with
87
    * {@link #sourceToTargetXor}, {@link #crc16} and {@link #translateToAlpha}.
88
    *
89
    * @param    data
90
    *           The data to be encoded.  This must not be <code>null</code>.  The array may be
91
    *           of any length.
92
    *
93
    * @return   16 encoded bytes where each byte is one of the 52 possible output characters.
94
    */
95
   public static byte[] encode(int[] data)
96
   {
97
      short[] storage = new short[OUTPUT_ARRAY_SIZE];
98
      
99
      int crc    = INITIAL_CRC_VALUE;
100
      int passes = CORE_LOOP_PASSES;
101
      
102
      // the input data is truncated at the first encountered null character
103
      for (int n = 0; n < data.length; n++)
104
      {
105
         if (data[n] == 0)
106
         {
107
            data = Arrays.copyOf(data, n);
108
            break;
109
         }
110
      }
111
      
112
      while (passes != 0)
113
      {
114
         sourceToTargetXor(data, storage);
115
         
116
         int idx = 0;
117
         
118
         while (idx < storage.length)
119
         {
120
            crc = crc16(storage, crc);
121
            
122
            storage[idx++] = (byte)(crc & 0xFF);
123
            storage[idx++] = (byte)((crc >> 8) & 0xFF);
124
         }
125
         
126
         passes--;
127
      }
128
      
129
      return translateToAlpha(storage);
130
   }
131
   
132
   /**
133
    * Walk forward through the array of source data provided and XOR each byte into the target
134
    * array, but with the index of the target matching a backwards walk.  The source data can
135
    * be an array of any length and the target length is not expected to match.  The target
136
    * indexes are calculated modulo the length of the target array, which means that for a source
137
    * array longer than the target array, one or more target array elements will store data XOR'd
138
    * from more than one source source array element (i.e. the XOR processing will wrap around).
139
    * For a source array smaller than the target array, there will be some low index elements
140
    * of the target array which will not receive any XOR'd data.  Only in the case where the
141
    * two arrays are the same length will there be no wrapping and no unmerged elements in the
142
    * target array.  Only in the case where the source array is modulo the size of the target
143
    * array will all elements of the target array be modified.
144
    * <p>
145
    * From a cryptographic perspective, the algorithm is quite poor.  Source arrays smaller
146
    * than the target arrays will leave target array bytes unmodified.  Source arrays larger
147
    * than the target array size will have some (or all) elements modified multiple times.
148
    * Both cases are causes of concern.  Not modifying elements means that there is less
149
    * input provided for distribution of results.  Modifying elements more than once can
150
    * cause issues as well.  Consider the case where the same character appears in the source
151
    * array in the first byte (index 0) and in the byte modulo the size of the target.  Because
152
    * of wrapping, this character will be XOR'd twice into the same element of the target
153
    * array.  XOR is its own inverse.  If you XOR the same data into a byte an even number of
154
    * times, then the resulting byte will be unchanged. This will have the effect of reducing
155
    * the distribution of the resulting data for some inputs. This is not suitable for secure
156
    * purposes.
157
    *
158
    * @param    source
159
    *           Source array to XOR from. Must not be <code>null</code>. May be of any length.
160
    * @param    target
161
    *           Target array to XOR into. Must not be <code>null</code>. May be of any length.    
162
    */
163
   public static void sourceToTargetXor(int[] source, short[] target)
164
   {
165
      int len = target.length;
166
      int max = len - 1;
167
      
168
      for (int i = 0; i < source.length; i++)
169
      {
170
         target[max - (i % len)] ^= source[i] & 0xFF;
171
      }
172
   }
173
   
174
   /**
175
    * This implements a standard CRC-16 (also known as CRC-16-IBM or CRC-16-IBM) algorithm that
176
    * uses a reversed polynomial of 0xA001 and swapped byte ordering. Using a reversed polynomial
177
    * means it processes each byte's least significant bit first (it shifts the binary data to
178
    * the right). Using swapped byte ordering means that the input data (which is being treated
179
    * as an arbitrarily large binary number) must be processed from highest element to lowest
180
    * element, whereas normally one might consider the most significant byte to be in the highest
181
    * array index position, this algorithm assumes the opposite.
182
    * <p>
183
    * If this algorithm generated uniformly distributed hashes, then in a best case scenario the
184
    * probability of a collision between any 2 items is (1 / 2^16) or .00152588 %.  That seems
185
    * good until one considers that between any 300 items there is a 50% chance of collisions
186
    * and between any 430 items there is a 75% chance of a collision!  This is the well known
187
    * birthday problem (see http://en.wikipedia.org/wiki/Birthday_attack).
188
    * <p>
189
    * Please note that the CRC-16 algorithm is not suitable for cryptographic hashing.  It does
190
    * NOT generate uniformly distributed hashes.  This means that the collision rate is not even
191
    * as good as the best case scenario.  Even if it did have uniform distribution, the small
192
    * number of bits means that collisions are not very rare.  CRC is more suitable for error
193
    * detection.  It should NOT be used for secure purposes.
194
    *
195
    * @param    data
196
    *           The bytes of data to CRC.
197
    * @param    crc
198
    *           The initial CRC value into which each element of the data will be XOR'd.
199
    *
200
    * @return   The calculated CRC value after factoring in all data bytes and binary dividing
201
    *           the polynomial.
202
    */
203
   public static int crc16(short[] data, int crc)
204
   {
205
      // iterate from the top of the input array to the bottom
206
      for (int idx = (data.length - 1); idx >= 0; idx--)
207
      {
208
         // XOR the input data into the crc
209
         crc ^= (int) (data[idx] & 0xFF);
210
         
211
         int bit = 7;
212
         
213
         // process each bit
214
         while (bit >= 0)
215
         {
216
            // check if the least significant bit is set (must be done before shifting)
217
            boolean lsb = ((crc & 0x01) == 1);
218
            
219
            // shift the data right by one bit
220
            crc >>= 1;
221
            
222
            if (lsb)
223
            {
224
               // the least significant bit was set, XOR the polynomial into the crc 
225
               crc ^= REVERSED_POLYNOMIAL;
226
            }
227
            
228
            bit--;
229
         }
230
      }
231
      
232
      return crc;
233
   }
234
   
235
   /**
236
    * Convert the source array into an array of bytes with the same number of elements, but
237
    * where each byte may only contain uppercase or lowercase English alphabetic characters
238
    * (a-z and A-Z).
239
    * <p>
240
    * The source array will have each element translated into a corresponding element in the
241
    * output array.  Only the least significant byte of the source array element is considered
242
    * in the translation algorithm.  The order of the elements in the output array will be
243
    * the same order as the source array (e.g. the first source element translates to the
244
    * first output element and so forth).
245
    * <p>
246
    * The following algorithm is used to translate the 256 possible source byte values into
247
    * the 52 possible output byte values:
248
    * <p>
249
    * <ol>
250
    *   <li> If the least significant 7 bits of the source byte are one of the 52 possible
251
    *        English alphabetic characters, then that is the resulting byte.  This will
252
    *        yield a byte with 0x41 ('A') through 0x5A ('Z') or 0x61 ('a') - 0x7A ('z'),
253
    *        inclusive.
254
    *   <li> Otherwise, the most significant (upper) nibble of the source byte will be used
255
    *        to select a character from 'a' through 'q'.  This nibble can only have one of
256
    *        16 possible values (0 through 15).  This nibble value is used as a direct
257
    *        index into the 16 possible lowercase output letters.  Thus, a nibble value of
258
    *        0 (0x0) will yield 'a' (0x61), 1 (0x1) will yield 'b' (0x62) and so on, with
259
    *        the last possible value of 15 (0xF) yielding 'p' (0x70).
260
    * </ol>
261
    * <p>
262
    * Assuming the source bytes are uniformly distributed, this translation approach is
263
    * guaranteed to result in a highly UNEVEN distribution of output bytes.  This is
264
    * cryptographically BAD and should NOT be used for secure purposes. To understand
265
    * why this occurs:
266
    * <p>
267
    * <pre>
268
    * Source Byte Range    Output Byte                               Distribution Notes
269
    * -----------------    -----------    ----------------------------------------------------------------------------   
270
    * 0x00 - 0x40          'a' - 'e'      'a' through 'd' are 16X more likely than 'e'
271
    * 0x41 - 0x5A          'A' - 'Z'      all equally likely (1 to 1 mapping of input and output)
272
    * 0x5B - 0x60          'f' - 'g'      'f' 5X more likely than 'g'
273
    * 0x61 - 0x7A          'a' - 'z'      all equally likely (1 to 1 mapping of input and output)
274
    * 0x7B - 0xC0          'h' - 'm'      'i' through 'l' are 16X more likely than 'm', 'h' is 5X more likely than 'm'
275
    * 0xC1 - 0xDA          'A' - 'Z'      all equally likely (1 to 1 mapping of input and output)
276
    * 0xDB - 0xE0          'n' - 'o'      'n' 5X more likely than 'o'
277
    * 0xE1 - 0xFA          'a' - 'z'      all equally likely (1 to 1 mapping of input and output)
278
    * 0xFB - 0xFF          'p'            only 'p' is possible
279
    * </pre>
280
    * <p>
281
    * More graphically, here is the exact distribution that will occur for perfectly distributed input:
282
    * <p>
283
    * <pre>
284
    * Character   Frequency          Histogram
285
    * ---------   ---------   -----------------------
286
    * A           2           ++
287
    * B           2           ++
288
    * C           2           ++
289
    * D           2           ++
290
    * E           2           ++
291
    * F           2           ++
292
    * G           2           ++
293
    * H           2           ++
294
    * I           2           ++
295
    * J           2           ++
296
    * K           2           ++
297
    * L           2           ++
298
    * M           2           ++
299
    * N           2           ++
300
    * O           2           ++
301
    * P           2           ++
302
    * Q           2           ++
303
    * R           2           ++
304
    * S           2           ++
305
    * T           2           ++
306
    * U           2           ++
307
    * V           2           ++
308
    * W           2           ++
309
    * X           2           ++
310
    * Y           2           ++
311
    * Z           2           ++
312
    * a           18          ++++++++++++++++++
313
    * b           18          ++++++++++++++++++
314
    * c           18          ++++++++++++++++++
315
    * d           18          ++++++++++++++++++
316
    * e           3           +++
317
    * f           7           +++++++
318
    * g           3           +++
319
    * h           7           +++++++
320
    * i           18          ++++++++++++++++++
321
    * j           18          ++++++++++++++++++
322
    * k           18          ++++++++++++++++++
323
    * l           18          ++++++++++++++++++
324
    * m           3           +++
325
    * n           7           +++++++
326
    * o           3           +++
327
    * p           7           +++++++
328
    * q           2           ++
329
    * r           2           ++
330
    * s           2           ++
331
    * t           2           ++
332
    * u           2           ++
333
    * v           2           ++
334
    * w           2           ++
335
    * x           2           ++
336
    * y           2           ++
337
    * z           2           ++
338
    * </pre>
339
    * <p>
340
    * This non-uniform distribution is purely due to the fallback approach of using the
341
    * upper nibble of some bytes as a selector into a subset of the possible output values.
342
    * Since the only a subset of the output values are targeted, it makes those values
343
    * more likely to occur.  In addition, because there are some upper nibble values that
344
    * are less likely to be encountered (because they rarely trigger the fallback mechanism
345
    * in the first place), this causes an additional level of non-uniformity even within
346
    * the subset that is possible to be selected.
347
    *
348
    * @param    source
349
    *           The bytes to convert.  Must not be <code>null</code>.  Only the least
350
    *           significant byte of each element will be used.
351
    *
352
    * @return   The converted array of bytes, with one element for each corresponding
353
    *           element of the source array.
354
    */
355
   public static byte[] translateToAlpha(short[] source)
356
   {
357
      byte[] target = new byte[source.length];
358
      
359
      for (int idx = 0; idx < source.length; idx++)
360
      {
361
         target[idx] = (byte)(source[idx] & 0x7F);
362
         
363
         if ((target[idx] < 'A' || (target[idx] > 'Z' && target[idx] < 'a') || target[idx] > 'z'))
364
         {
365
            target[idx] = (byte)(((source[idx] & 0xFF) >> 4) + 'a');
366
         }
367
      }
368
      
369
      return target;
370
   }
371
   
372
   /**
373
    * Print the command line syntax help.
374
    */
375
   private static void syntax()
376
   {
377
      System.out.println("Syntax: java CustomCRC16Encoder <mode> " +
378
                         "{<filename_of_binary_encoded_input_strings> | " +
379
                         "<input_string_to_hash>} [<input_string_to_hash> ...]");
380
      System.out.println("Where: <mode> is B (for binary file input), " +
381
                         "N (no-nulls binary file input), " +
382
                         "T (text file input) or A (for argument mode)");
383
   }
384
   
385
   /**
386
    * Command line test program.
387
    */
388
   public static void main(String[] args)
389
   {    
390
      if (args.length < 2)
391
      {
392
         syntax();
393
         System.exit(-1);
394
      }
395
      
396
      boolean honorNull = true;
397
      
398
      if ("n".equalsIgnoreCase(args[0]))
399
      {
400
         args[0] = "b";
401
         honorNull = false;
402
      }
403
      
404
      if ("b".equalsIgnoreCase(args[0]))
405
      {
406
         // read strings from a binary file
407
         InputStream in = null;
408
         
409
         try
410
         {
411
            File file = new File(args[1]);
412
            
413
            if (!file.exists() || file.isDirectory())
414
            {
415
               syntax();
416
               System.exit(-2);
417
            }
418
            
419
            in = new BufferedInputStream(new FileInputStream(file));
420
            
421
            int    remain = (int) file.length();
422
            int    idx    = 0;
423
            
424
            while (idx < remain)
425
            {
426
               // read 1 byte that tells us the size of the following binary string
427
               int sz = in.read();
428
               idx++;
429
               
430
               int[] data = new int[sz];
431
               
432
               int nulls = 0;
433
               
434
               // empty string is size 0
435
               for (int i = 0; i < sz; i++)
436
               {
437
                  // read in each byte (the value needs to be unsigned so we use int)
438
                  int next = in.read();
439
                  
440
                  // encode input in the 4GL is a character type which can only have embedded
441
                  // nulls chars using get-string() in a specific way, if you are using chr()
442
                  // or other forms then nulls won't be there and those null characters result
443
                  // in an empty string; this behavior is controlled with a flag to allow
444
                  // testing both ways
445
                  if (!honorNull && next == 0)
446
                  {
447
                     nulls++;
448
                     continue;
449
                  }
450
                  
451
                  data[i - nulls] = next;
452
               }
453
               
454
               // remove the unused array elements
455
               if (nulls > 0)
456
               {
457
                  data = Arrays.copyOf(data, (sz - nulls));
458
               }
459
            
460
               idx += sz;
461
               
462
               byte[] result = encode(data);
463
               
464
               System.out.println(new String(result));
465
            }
466
         }
467
         
468
         catch (IOException ioe)
469
         {
470
            ioe.printStackTrace();
471
         }
472
         
473
         finally
474
         {
475
            try
476
            {
477
               if (in != null)
478
                  in.close();
479
            }
480
            
481
            catch (IOException ioe)
482
            {
483
               // ignore
484
            }
485
         }
486
      }
487
      else if ("t".equalsIgnoreCase(args[0]))
488
      {
489
         // read strings from a text file
490
         BufferedReader reader = null;
491
         
492
         try
493
         {
494
            File file = new File(args[1]);
495
            
496
            if (!file.exists() || file.isDirectory())
497
            {
498
               syntax();
499
               System.exit(-2);
500
            }
501
            
502
            reader = new BufferedReader(new FileReader(file));
503
            
504
            String next = reader.readLine();
505
            
506
            while (next != null)
507
            {
508
               int len = next.length();
509
               
510
               int[] data = new int[len];
511
               
512
               for (int i = 0; i < len; i++)
513
               {
514
                  data[i] = (int) next.charAt(i);
515
               }
516
               
517
               byte[] result = encode(data);
518
               
519
               System.out.println(new String(result));
520
               
521
               // read 1 line w/o CR or LF
522
               next = reader.readLine();
523
            }
524
         }
525
         
526
         catch (IOException ioe)
527
         {
528
            ioe.printStackTrace();
529
         }
530
         
531
         finally
532
         {
533
            try
534
            {
535
               if (reader != null)
536
                  reader.close();
537
            }
538
            
539
            catch (IOException ioe)
540
            {
541
               // ignore
542
            }
543
         }
544
      }
545
      else
546
      {
547
         // strings are encoded in args
548
         for (int i = 1; i < args.length; i++)
549
         {
550
            byte[] txt  = args[i].getBytes();
551
            int[]  data = new int[txt.length]; 
552
            
553
            for (int k = 0; k < txt.length; k++)
554
            {
555
               data[k] = txt[k];
556
            }
557
            
558
            byte[] hashed = encode(data);
559
            System.out.printf("%60s = %16s\n", "'" + args[i] + "'", new String(hashed));
560
         }
561
      }
562
   }
563
}