Project

General

Profile

3871_2.patch

Sergey Ivanovskiy, 10/10/2019 03:01 PM

Download (8.13 KB)

View differences:

src/com/goldencode/p2j/schema/ImportWorker.java 2019-10-10 18:41:58 +0000
139 139
** 050 CA  20190905          The data file is not mapped to memory, but to a byte buffer.  This 
140 140
**                           allows very large dump files to be imported, without limitations from
141 141
**                           the physical memory.
142
** 051 SBI 20191008          Added the source converter for single-byte encodings, set the source
143
**                           encoding.
142 144
*/
143 145
/*
144 146
** This program is free software: you can redistribute it and/or modify
......
198 200
import java.io.*;
199 201
import java.lang.InstantiationException;
200 202
import java.lang.reflect.*;
203
import java.nio.charset.*;
201 204
import java.sql.*;
202 205
import java.util.*;
203 206
import java.util.logging.*;
207

  
208
import com.goldencode.p2j.util.*;
204 209
import com.goldencode.p2j.util.ErrorManager;
210

  
205 211
import org.hibernate.*;
206 212
import org.hibernate.cfg.*;
207 213
import org.hibernate.dialect.*;
208 214
import org.hibernate.service.*;
209 215
import org.hibernate.type.*;
210 216
import org.hibernate.type.Type;
217

  
211 218
import com.goldencode.p2j.pattern.*;
212 219
import com.goldencode.p2j.persist.*;
213 220
import com.goldencode.p2j.persist.type.*;
......
1026 1033
               
1027 1034
               // Open a new session and begin a transaction.
1028 1035
               session = openSession();
1036
               
1029 1037
               Transaction tx = session.beginTransaction();
1030 1038
               
1039
               Query charsetQuery = session.createSQLQuery(
1040
                        "SELECT character_set_name FROM information_schema.character_sets;");
1041
               Object targetCharset = charsetQuery.list().get(0);
1042
               
1043
               if (targetCharset instanceof String)
1044
               {
1045
                  stream.setConvertTarget((String) targetCharset);
1046
               }
1031 1047
               // Read up to batchSize records from input file.
1032 1048
               for (int i = records.size(); i < batchSize && !eof && !recovery; i++, counter++)
1033 1049
               {
......
2904 2920
      /** The date format as it was set when the table was dumped. */
2905 2921
      private String dateFormat = null;
2906 2922
      
2923
      /** The charset converter for the import source. */
2924
      private CharsetConverter charsetConverter;
2925
      
2907 2926
      /**
2908 2927
       * Constructor.
2909 2928
       * 
......
2934 2953
            // Any I/O issue should already have happened in the super constructor.
2935 2954
            ErrorManager.recordOrThrowError(98, "Unable to open file:" + filename + ".");
2936 2955
         }
2956
         
2957
         charsetConverter = createSourceCharsetConverter();
2958
         
2959
         boolean isUTF8 = isSourceCodePageUTF8();
2960
         
2961
         setUtf8Mode(isUTF8);
2962
         
2963
         // take into account the target conversion that depends on this mode
2964
         setConvert((charsetConverter != null) || isUTF8);
2937 2965
      }
2938 2966
      
2939 2967
      /**
......
2980 3008
      }
2981 3009
      
2982 3010
      /**
3011
       * Returns the charset converter.
3012
       * 
3013
       * @return   The charset converter
3014
       */
3015
      protected CharsetConverter getCharsetConverter()
3016
      {
3017
         if (charsetConverter != null || !convert)
3018
         {
3019
            return charsetConverter;
3020
         }
3021
         
3022
         return super.getCharsetConverter();
3023
      }
3024

  
3025
      /**
2983 3026
       * Reads the PSC footer, storing the key/values pairs in private map pscHeader.
2984 3027
       * 
2985 3028
       * @return  The number of PSC records actually read. If negative, the footer could not 
......
3082 3125
         encoding    = getMetadata("cpstream");    // eg. ISO8859-15
3083 3126
         // cc = new CharsetConverter(encoding);
3084 3127
         
3128
         setConvertSource(encoding);
3129
         
3085 3130
         ldbname     = getMetadata("ldbname");     // eg. p2j_test
3086 3131
         timestamp   = getMetadata("timestamp");   // eg. 2013/06/07-09:57:02
3087 3132
         
src/com/goldencode/p2j/util/FileStream.java 2019-10-10 18:50:03 +0000
37 37
** 019 ECF 20171026          Added write(byte[], int, int) method.
38 38
** 020 EVL 20180620          Adding pulse on close for empty frames.
39 39
** 021 CA  20190905          Allow byte buffer instead of memory buffers for read-only files.
40
** 022 SBI 20191008          Added getCharsetConverter(), createSourceCharsetConverter() and
41
**                           isSourceCodePageUTF8().
40 42
*/
41 43

  
42 44
/*
......
97 99
import java.io.*;
98 100
import java.nio.*;
99 101
import java.nio.channels.*;
102
import java.nio.charset.*;
103

  
100 104

  
101 105
/**
102 106
 * A stream class supporting input and output semantics for any file-like
......
156 160
   /** Number of bytes in the write buffer (non-memory-mapped mode). */
157 161
   private int pending = 0;
158 162
   
163
   protected boolean utf8Mode;
164
   
159 165
   /**
160 166
    * Constructs an instance using a filename, this file or device will be
161 167
    * opened for reading or writing based on the given <code>write</code>
......
857 863
   }
858 864
   
859 865
   /**
866
    * Returns the charset converter.
867
    * 
868
    * @return   The charset converter
869
    */
870
   protected CharsetConverter getCharsetConverter()
871
   {
872
      return cc;
873
   }
874
   
875
   /**
876
    * Creates the source charset converter if the source code page is a single-byte encoding;
877
    * otherwise, or if the code page cannot be resolved or encoded, returns null.
878
    * 
879
    * @return   The source charset converter
880
    */
881
   protected CharsetConverter createSourceCharsetConverter()
882
   {
883
      CharsetConverter charsetConverter;
884
      
885
      try
886
      {
887
         Charset charset = Charset.forName(sourceCp);
888
         
889
         if (charset.newEncoder().maxBytesPerChar() == 1)
890
         {
891
            charsetConverter = new CharsetConverter(sourceCp);
892
         }
893
         else
894
         {
895
            charsetConverter = null;
896
         }
897
      }
898
      catch(IllegalArgumentException | UnsupportedOperationException e)
899
      {
900
         charsetConverter = null;
901
      }
902

  
903
      return charsetConverter;
904
   }
905
   
906
   /**
907
    * Tests if the source code page is UTF-8.
908
    * 
909
    * @return   true if the source code page is UTF-8, otherwise false.
910
    */
911
   protected boolean isSourceCodePageUTF8()
912
   {
913
      Charset cs = Charset.forName(sourceCp);
914
      
915
      return StandardCharsets.UTF_8.equals(cs);
916
   }
917
   
918
   /**
919
    * Sets the UTF-8 mode, in which one character may be encoded as a sequence of up to 4 bytes.
920
    * 
921
    * @param    utf8Mode
922
    *           The flag indicating if this file stream is in UTF-8 mode.
923
    */
924
   protected void setUtf8Mode(boolean utf8Mode)
925
   {
926
      this.utf8Mode = utf8Mode;
927
   }
928
   
929
   /**
860 930
    * Write a byte to the buffer, flushing if the buffer is full.
861 931
    *
862 932
    * @param    b
......
959 1029
   {
960 1030
      int ch = readWorker(false);
961 1031
      
962
      if (convert && ch >= 0)
1032
      if (convert && !utf8Mode && ch >= 0)
963 1033
      {
964
         ch = cc.toChar(ch);
1034
         ch = getCharsetConverter().toChar(ch);
965 1035
      }
966 1036
      
967 1037
      return ch;
......
997 1067
         {
998 1068
            mem.mark();
999 1069
         }
1000
         ch = (mem.get() & 0x000000FF);
1070
         
1071
         byte b0 = mem.get();
1072
         ch = (b0 & 0x000000FF);
1073
         
1074
         if (utf8Mode)
1075
         {
1076
            byte[] utf8Bytes = null;
1077
            
1078
            int prefix = ch >> 4;
1079
      
1080
            if ((prefix >> 1) == 0b00000110)
1081
            {
1082
               byte b1 = mem.get();
1083
               utf8Bytes = new byte[] { b0, b1};
1084
               
1085
            }
1086
            else if (prefix == 0b00001110)
1087
            {
1088
               byte b1 = mem.get();
1089
               byte b2 = mem.get();
1090
               utf8Bytes = new byte[] { b0, b1, b2};
1091
            }
1092
            else if (prefix == 0b00001111)
1093
            {
1094
               byte b1 = mem.get();
1095
               byte b2 = mem.get();
1096
               byte b3 = mem.get();
1097
               utf8Bytes = new byte[] { b0, b1, b2, b3};
1098
            }
1099
            
1100
            if (utf8Bytes != null)
1101
            {
1102
               ch = new String(utf8Bytes, StandardCharsets.UTF_8).codePointAt(0);
1103
            }
1104
         }
1105
         
1001 1106
         if (peek)
1002 1107
         {
1003 1108
            mem.reset();