3871_2.patch
src/com/goldencode/p2j/schema/ImportWorker.java 2019-10-10 18:41:58 +0000 | ||
---|---|---|
139 | 139 |
** 050 CA 20190905 The data file is not mapped to memory, but to a byte buffer. This |
140 | 140 |
** allows very large dump files to be imported, without limitations from |
141 | 141 |
** the physical memory. |
142 |
** 051 SBI 20191008 Added the source converter for one byte encodings, set the source |
|
143 |
** encoding. |
|
142 | 144 |
*/ |
143 | 145 |
/* |
144 | 146 |
** This program is free software: you can redistribute it and/or modify |
... | ... | |
198 | 200 |
import java.io.*; |
199 | 201 |
import java.lang.InstantiationException; |
200 | 202 |
import java.lang.reflect.*; |
203 |
import java.nio.charset.*; |
|
201 | 204 |
import java.sql.*; |
202 | 205 |
import java.util.*; |
203 | 206 |
import java.util.logging.*; |
207 | ||
208 |
import com.goldencode.p2j.util.*; |
|
204 | 209 |
import com.goldencode.p2j.util.ErrorManager; |
210 | ||
205 | 211 |
import org.hibernate.*; |
206 | 212 |
import org.hibernate.cfg.*; |
207 | 213 |
import org.hibernate.dialect.*; |
208 | 214 |
import org.hibernate.service.*; |
209 | 215 |
import org.hibernate.type.*; |
210 | 216 |
import org.hibernate.type.Type; |
217 | ||
211 | 218 |
import com.goldencode.p2j.pattern.*; |
212 | 219 |
import com.goldencode.p2j.persist.*; |
213 | 220 |
import com.goldencode.p2j.persist.type.*; |
... | ... | |
1026 | 1033 |
|
1027 | 1034 |
// Open a new session and begin a transaction. |
1028 | 1035 |
session = openSession(); |
1036 |
|
|
1029 | 1037 |
Transaction tx = session.beginTransaction(); |
1030 | 1038 |
|
1039 |
Query charsetQuery = session.createSQLQuery( |
|
1040 |
"SELECT character_set_name FROM information_schema.character_sets;"); |
|
1041 |
Object targetCharset = charsetQuery.list().get(0); |
|
1042 |
|
|
1043 |
if (targetCharset instanceof String) |
|
1044 |
{ |
|
1045 |
stream.setConvertTarget((String) targetCharset); |
|
1046 |
} |
|
1031 | 1047 |
// Read up to batchSize records from input file. |
1032 | 1048 |
for (int i = records.size(); i < batchSize && !eof && !recovery; i++, counter++) |
1033 | 1049 |
{ |
... | ... | |
2904 | 2920 |
/** The date format as it was set when the table was dumped. */ |
2905 | 2921 |
private String dateFormat = null; |
2906 | 2922 |
|
2923 |
/** The import source charset converter */ |
|
2924 |
private CharsetConverter charsetConverter; |
|
2925 |
|
|
2907 | 2926 |
/** |
2908 | 2927 |
* Constructor. |
2909 | 2928 |
* |
... | ... | |
2934 | 2953 |
// If any IO issue would occur, it should have happened in the super c'tor. |
2935 | 2954 |
ErrorManager.recordOrThrowError(98, "Unable to open file:" + filename + "."); |
2936 | 2955 |
} |
2956 |
|
|
2957 |
charsetConverter = createSourceCharsetConverter(); |
|
2958 |
|
|
2959 |
boolean isUTF8 = isSourceCodePageUTF8(); |
|
2960 |
|
|
2961 |
setUtf8Mode(isUTF8); |
|
2962 |
|
|
2963 |
// take into account the target conversion that depends on this mode |
|
2964 |
setConvert((charsetConverter != null) || isUTF8); |
|
2937 | 2965 |
} |
2938 | 2966 |
|
2939 | 2967 |
/** |
... | ... | |
2980 | 3008 |
} |
2981 | 3009 |
|
2982 | 3010 |
/** |
3011 |
* Returns the charset converter. |
|
3012 |
* |
|
3013 |
* @return The charset converter |
|
3014 |
*/ |
|
3015 |
protected CharsetConverter getCharsetConverter() |
|
3016 |
{ |
|
3017 |
if (charsetConverter != null || !convert) |
|
3018 |
{ |
|
3019 |
return charsetConverter; |
|
3020 |
} |
|
3021 |
|
|
3022 |
return super.getCharsetConverter(); |
|
3023 |
} |
|
3024 | ||
3025 |
/** |
|
2983 | 3026 |
* Reads the PSC footer, storing the key/values pairs in private map pscHeader. |
2984 | 3027 |
* |
2985 | 3028 |
* @return The number of PSC records actually read. If negative, the footer could not |
... | ... | |
3082 | 3125 |
encoding = getMetadata("cpstream"); // eg. ISO8859-15 |
3083 | 3126 |
// cc = new CharsetConverter(encoding); |
3084 | 3127 |
|
3128 |
setConvertSource(encoding); |
|
3129 |
|
|
3085 | 3130 |
ldbname = getMetadata("ldbname"); // eg. p2j_test |
3086 | 3131 |
timestamp = getMetadata("timestamp"); // eg. 2013/06/07-09:57:02 |
3087 | 3132 |
|
src/com/goldencode/p2j/util/FileStream.java 2019-10-10 18:50:03 +0000 | ||
---|---|---|
37 | 37 |
** 019 ECF 20171026 Added write(byte[], int, int) method. |
38 | 38 |
** 020 EVL 20180620 Adding pulse on close for empty frames. |
39 | 39 |
** 021 CA 20190905 Allow byte buffer instead of memory buffers for read-only files. |
40 |
** 022 SBI 20191008 Added getCharsetConverter(), createSourceCharsetConverter() and |
|
41 |
** isSourceCodePageUTF8(). |
|
40 | 42 |
*/ |
41 | 43 | |
42 | 44 |
/* |
... | ... | |
97 | 99 |
import java.io.*; |
98 | 100 |
import java.nio.*; |
99 | 101 |
import java.nio.channels.*; |
102 |
import java.nio.charset.*; |
|
103 | ||
100 | 104 | |
101 | 105 |
/** |
102 | 106 |
* A stream class supporting input and output semantics for any file-like |
... | ... | |
156 | 160 |
/** Number of bytes in the write buffer (non-memory-mapped mode). */ |
157 | 161 |
private int pending = 0; |
158 | 162 |
|
163 |
protected boolean utf8Mode; |
|
164 |
|
|
159 | 165 |
/** |
160 | 166 |
* Constructs an instance using a filename, this file or device will be |
161 | 167 |
* opened for reading or writing based on the given <code>write</code> |
... | ... | |
857 | 863 |
} |
858 | 864 |
|
859 | 865 |
/** |
866 |
* Returns the charset converter. |
|
867 |
* |
|
868 |
* @return The charset converter |
|
869 |
*/ |
|
870 |
protected CharsetConverter getCharsetConverter() |
|
871 |
{ |
|
872 |
return cc; |
|
873 |
} |
|
874 |
|
|
875 |
/** |
|
876 |
* Creates the source charset converter if the source code page is one byte encoding, otherwise |
|
877 |
* returns null value. |
|
878 |
* |
|
879 |
* @return The source charset converter |
|
880 |
*/ |
|
881 |
protected CharsetConverter createSourceCharsetConverter() |
|
882 |
{ |
|
883 |
CharsetConverter charsetConverter; |
|
884 |
|
|
885 |
try |
|
886 |
{ |
|
887 |
Charset charset = Charset.forName(sourceCp); |
|
888 |
|
|
889 |
if (charset.newEncoder().maxBytesPerChar() == 1) |
|
890 |
{ |
|
891 |
charsetConverter = new CharsetConverter(sourceCp); |
|
892 |
} |
|
893 |
else |
|
894 |
{ |
|
895 |
charsetConverter = null; |
|
896 |
} |
|
897 |
} |
|
898 |
catch(IllegalArgumentException | UnsupportedOperationException e) |
|
899 |
{ |
|
900 |
charsetConverter = null; |
|
901 |
} |
|
902 | ||
903 |
return charsetConverter; |
|
904 |
} |
|
905 |
|
|
906 |
/** |
|
907 |
* Tests if the source code page is UTF-8. |
|
908 |
* |
|
909 |
* @return true if the source code page is UTF-8, otherwise false. |
|
910 |
*/ |
|
911 |
protected boolean isSourceCodePageUTF8() |
|
912 |
{ |
|
913 |
Charset cs = Charset.forName(sourceCp); |
|
914 |
|
|
915 |
return StandardCharsets.UTF_8.equals(cs); |
|
916 |
} |
|
917 |
|
|
918 |
/** |
|
919 |
* Sets the UTF-8 mode when the read character can be encoded in a sequence of 4 bytes. |
|
920 |
* |
|
921 |
* @param utf8Mode |
|
922 |
* The flag indicating if this file stream is in UTF-8 mode. |
|
923 |
*/ |
|
924 |
protected void setUtf8Mode(boolean utf8Mode) |
|
925 |
{ |
|
926 |
this.utf8Mode = utf8Mode; |
|
927 |
} |
|
928 |
|
|
929 |
/** |
|
860 | 930 |
* Write a byte to the buffer, flushing if the buffer is full. |
861 | 931 |
* |
862 | 932 |
* @param b |
... | ... | |
959 | 1029 |
{ |
960 | 1030 |
int ch = readWorker(false); |
961 | 1031 |
|
962 |
if (convert && ch >= 0) |
|
1032 |
if (convert && !utf8Mode && ch >= 0)
|
|
963 | 1033 |
{ |
964 |
ch = cc.toChar(ch);
|
|
1034 |
ch = getCharsetConverter().toChar(ch);
|
|
965 | 1035 |
} |
966 | 1036 |
|
967 | 1037 |
return ch; |
... | ... | |
997 | 1067 |
{ |
998 | 1068 |
mem.mark(); |
999 | 1069 |
} |
1000 |
ch = (mem.get() & 0x000000FF); |
|
1070 |
|
|
1071 |
byte b0 = mem.get(); |
|
1072 |
ch = (b0 & 0x000000FF); |
|
1073 |
|
|
1074 |
if (utf8Mode) |
|
1075 |
{ |
|
1076 |
byte[] utf8Bytes = null; |
|
1077 |
|
|
1078 |
int prefix = ch >> 4; |
|
1079 |
|
|
1080 |
if ((prefix >> 1) == 0b00000110) |
|
1081 |
{ |
|
1082 |
byte b1 = mem.get(); |
|
1083 |
utf8Bytes = new byte[] { b0, b1}; |
|
1084 |
|
|
1085 |
} |
|
1086 |
else if (prefix == 0b00001110) |
|
1087 |
{ |
|
1088 |
byte b1 = mem.get(); |
|
1089 |
byte b2 = mem.get(); |
|
1090 |
utf8Bytes = new byte[] { b0, b1, b2}; |
|
1091 |
} |
|
1092 |
else if (prefix == 0b00001111) |
|
1093 |
{ |
|
1094 |
byte b1 = mem.get(); |
|
1095 |
byte b2 = mem.get(); |
|
1096 |
byte b3 = mem.get(); |
|
1097 |
utf8Bytes = new byte[] { b0, b1, b2, b3}; |
|
1098 |
} |
|
1099 |
|
|
1100 |
if (utf8Bytes != null) |
|
1101 |
{ |
|
1102 |
ch = new String(utf8Bytes, StandardCharsets.UTF_8).codePointAt(0); |
|
1103 |
} |
|
1104 |
} |
|
1105 |
|
|
1001 | 1106 |
if (peek) |
1002 | 1107 |
{ |
1003 | 1108 |
mem.reset(); |