1 /*
2 * Copyright 2006-2013 the original author or authors.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.springframework.batch.item.file.transform;
18
19 import java.util.ArrayList;
20 import java.util.Collection;
21 import java.util.HashSet;
22 import java.util.List;
23
24 import org.springframework.util.Assert;
25 import org.springframework.util.StringUtils;
26
27 /**
28 * A {@link LineTokenizer} implementation that splits the input String on a
29 * configurable delimiter. This implementation also supports the use of an
30 * escape character to escape delimiters and line endings.
31 *
32 * @author Rob Harrop
33 * @author Dave Syer
34 * @author Michael Minella
35 */
36 public class DelimitedLineTokenizer extends AbstractLineTokenizer {
37 /**
38 * Convenient constant for the common case of a tab delimiter.
39 */
40 public static final String DELIMITER_TAB = "\t";
41
42 /**
43 * Convenient constant for the common case of a comma delimiter.
44 */
45 public static final String DELIMITER_COMMA = ",";
46
47 /**
48 * Convenient constant for the common case of a " character used to escape
49 * delimiters or line endings.
50 */
51 public static final char DEFAULT_QUOTE_CHARACTER = '"';
52
53 // the delimiter character used when reading input.
54 private String delimiter;
55
56 private char quoteCharacter = DEFAULT_QUOTE_CHARACTER;
57
58 private String quoteString;
59
60 private Collection<Integer> includedFields = null;
61
62 /**
63 * Create a new instance of the {@link DelimitedLineTokenizer} class for the
64 * common case where the delimiter is a {@link #DELIMITER_COMMA comma}.
65 *
66 * @see #DelimitedLineTokenizer(String)
67 * @see #DELIMITER_COMMA
68 */
69 public DelimitedLineTokenizer() {
70 this(DELIMITER_COMMA);
71 }
72
73 /**
74 * Create a new instance of the {@link DelimitedLineTokenizer} class.
75 *
76 * @param delimiter the desired delimiter
77 */
78 public DelimitedLineTokenizer(String delimiter) {
79 Assert.state(!delimiter.equals(String.valueOf(DEFAULT_QUOTE_CHARACTER)), "[" + DEFAULT_QUOTE_CHARACTER
80 + "] is not allowed as delimiter for tokenizers.");
81
82 this.delimiter = delimiter;
83 setQuoteCharacter(DEFAULT_QUOTE_CHARACTER);
84 }
85
86 /**
87 * Setter for the delimiter character.
88 *
89 * @param delimiter
90 */
91 public void setDelimiter(String delimiter) {
92 this.delimiter = delimiter;
93 }
94
95 /**
96 * The fields to include in the output by position (starting at 0). By
97 * default all fields are included, but this property can be set to pick out
98 * only a few fields from a larger set. Note that if field names are
99 * provided, their number must match the number of included fields.
100 *
101 * @param includedFields the included fields to set
102 */
103 public void setIncludedFields(int[] includedFields) {
104 this.includedFields = new HashSet<Integer>();
105 for (int i : includedFields) {
106 this.includedFields.add(i);
107 }
108 }
109
110 /**
111 * Public setter for the quoteCharacter. The quote character can be used to
112 * extend a field across line endings or to enclose a String which contains
113 * the delimiter. Inside a quoted token the quote character can be used to
114 * escape itself, thus "a""b""c" is tokenized to a"b"c.
115 *
116 * @param quoteCharacter the quoteCharacter to set
117 *
118 * @see #DEFAULT_QUOTE_CHARACTER
119 */
120 public final void setQuoteCharacter(char quoteCharacter) {
121 this.quoteCharacter = quoteCharacter;
122 this.quoteString = "" + quoteCharacter;
123 }
124
125 /**
126 * Yields the tokens resulting from the splitting of the supplied
127 * <code>line</code>.
128 *
129 * @param line the line to be tokenized
130 *
131 * @return the resulting tokens
132 */
133 @Override
134 protected List<String> doTokenize(String line) {
135
136 List<String> tokens = new ArrayList<String>();
137
138 // line is never null in current implementation
139 // line is checked in parent: AbstractLineTokenizer.tokenize()
140 char[] chars = line.toCharArray();
141 boolean inQuoted = false;
142 int lastCut = 0;
143 int length = chars.length;
144 int fieldCount = 0;
145
146 for (int i = 0; i < length; i++) {
147
148 char currentChar = chars[i];
149 boolean isEnd = (i == (length - 1));
150
151 boolean isDelimiter = isDelimiter(chars, i, delimiter);
152
153 if ((isDelimiter && !inQuoted) || isEnd) {
154 int endPosition = (isEnd ? (length - lastCut) : (i - lastCut));
155
156 if (isEnd && isDelimiter) {
157 endPosition--;
158 }
159 else if (!isEnd){
160 endPosition = (endPosition - delimiter.length()) + 1;
161 }
162
163 if (includedFields == null || includedFields.contains(fieldCount)) {
164 String value = maybeStripQuotes(new String(chars, lastCut, endPosition));
165 tokens.add(value);
166 }
167
168 fieldCount++;
169
170 if (isEnd && (isDelimiter)) {
171 if (includedFields == null || includedFields.contains(fieldCount)) {
172 tokens.add("");
173 }
174 fieldCount++;
175 }
176
177 lastCut = i + 1;
178 }
179 else if (isQuoteCharacter(currentChar)) {
180 inQuoted = !inQuoted;
181 }
182
183 }
184
185 return tokens;
186 }
187
188 /**
189 * If the string is quoted strip (possibly with whitespace outside the
190 * quotes (which will be stripped), replace escaped quotes inside the
191 * string. Quotes are escaped with double instances of the quote character.
192 *
193 * @param string
194 * @return the same string but stripped and unescaped if necessary
195 */
196 private String maybeStripQuotes(String string) {
197 String value = string.trim();
198 if (isQuoted(value)) {
199 value = StringUtils.replace(value, "" + quoteCharacter + quoteCharacter, "" + quoteCharacter);
200 int endLength = value.length() - 1;
201 // used to deal with empty quoted values
202 if (endLength == 0) {
203 endLength = 1;
204 }
205 value = value.substring(1, endLength);
206 return value;
207 }
208 return string;
209 }
210
211 /**
212 * Is this string surrounded by quote characters?
213 *
214 * @param value
215 * @return true if the value starts and ends with the
216 * {@link #quoteCharacter}
217 */
218 private boolean isQuoted(String value) {
219 if (value.startsWith(quoteString) && value.endsWith(quoteString)) {
220 return true;
221 }
222 return false;
223 }
224
225 /**
226 * Is the supplied character the delimiter character?
227 *
228 * @param c the character to be checked
229 * @return <code>true</code> if the supplied character is the delimiter
230 * character
231 * @see DelimitedLineTokenizer#DelimitedLineTokenizer(char)
232 */
233 private boolean isDelimiter(char[] chars, int i, String token) {
234 boolean result = false;
235
236 if(i >= token.length()) {
237 String end = new String(chars, (i-token.length()) + 1, token.length());
238 if(token.equals(end)) {
239 result = true;
240 }
241 }
242
243 return result;
244 }
245
246 /**
247 * Is the supplied character a quote character?
248 *
249 * @param c the character to be checked
250 * @return <code>true</code> if the supplied character is an quote character
251 * @see #setQuoteCharacter(char)
252 */
253 protected boolean isQuoteCharacter(char c) {
254 return c == quoteCharacter;
255 }
256 }