package net.quies.tech.csv;

/*
Copyright (c) 2006, 2007 Pascal S. de Kloe. All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.
3. The name of the author may not be used to endorse or promote products derived
   from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
OF SUCH DAMAGE.
*/

import java.io.*;

/**
 * Parses CSV from a stream, one record per call.
 *
 * <p>Each call to {@link #readRecord(int)} consumes characters from the
 * underlying {@code Reader} up to and including the record terminator (CR LF)
 * or the end of the stream.
 *
 * @author Pascal S. de Kloe
 * @since 1.0
 */
public final class CSVReader {

    /**
     * Creates a reader with the default field buffer capacity of 80 characters.
     *
     * @param source the CSV stream. The use of a {@link java.io.BufferedReader}
     *        is recommended because the parser reads one character at a time.
     * @throws IllegalArgumentException if {@code source} is {@code null}.
     * @since 1.0
     */
    public CSVReader(Reader source) {
        this(source, 80);
    }

    /**
     * Creates a reader with a caller-chosen field buffer capacity.
     *
     * @param source the CSV stream. The use of a {@link java.io.BufferedReader}
     *        is recommended because the parser reads one character at a time.
     * @param bufferSize the initial capacity of the buffer in which the text of
     *        a field is collected. The default is 80.
     * @throws IllegalArgumentException if {@code source} is {@code null} or
     *         {@code bufferSize} is negative.
     * @since 1.0
     */
    public CSVReader(Reader source, int bufferSize) {
        if (source == null)
            throw new IllegalArgumentException("The source must not be null.");
        if (bufferSize < 0)
            throw new IllegalArgumentException("The buffer size must not be negative: " + bufferSize);
        this.in = source;
        this.bufferSize = bufferSize;
    }

    /**
     * Gets the zero-based record index which is equal to the number of
     * records read successfully so far.
     *
     * @return the number of records returned by this reader.
     * @since 1.0
     */
    public long getRecordIndex() {
        return record;
    }

    /**
     * This method is equal to {@link #readRecord(int)}.
     *
     * @param names the number of column names expected in the header.
     * @return an array of names or {@code null} if the end of the stream has
     *         been reached.
     * @throws IOException see {@link #readRecord(int)}.
     * @throws WrongFieldsException see {@link #readRecord(int)}.
     * @throws RFC4180Violation see {@link #readRecord(int)}.
     * @since 1.0
     */
    public String[] readHeader(int names)
            throws IOException, WrongFieldsException, RFC4180Violation {
        return readRecord(names);
    }

    /**
     * Parses the current record in the stream to an array of fields.
     *
     * <p>Fields are parsed in a loop rather than by recursion, so the call
     * stack depth does not grow with the number of fields in the record.
     *
     * @param fields the number of fields in a record.
     * @return an array of fields or {@code null} if the end of the stream has
     *         been reached before any character of a record was read.
     * @throws IllegalArgumentException if {@code fields} is not positive.
     * @throws IOException if the {@code Reader} caused any troubles.
     *         This leaves the stream in an undefined state.
     * @throws WrongFieldsException if the number of fields in the record is
     *         not equal to {@code fields}.
     *         This leaves the stream in an undefined state.
     * @throws RFC4180Violation if the current record in the stream does not
     *         conform to RFC 4180, including a quoted field that is still open
     *         when the stream ends.
     *         This leaves the stream in an undefined state.
     * @since 1.0
     */
    public String[] readRecord(int fields)
            throws IOException, WrongFieldsException, RFC4180Violation {
        if (fields <= 0)
            throw new IllegalArgumentException("A record must contain at least one field.");

        int c = in.read();
        if (c < 0)
            return null;

        String[] values = new String[fields];
        int fieldIndex = 0;
        StringBuilder buffer = new StringBuilder(bufferSize);

        while (true) {
            buffer.setLength(0);
            // An opening double quote selects the escaped form; the quote
            // itself is not part of the field text.
            int terminator = (c == DQUOTE)
                    ? readEscaped(buffer)
                    : readNonEscaped(buffer, c);
            values[fieldIndex++] = buffer.toString();

            if (terminator == COMMA) {
                // A comma promises one more field; fail before reading on
                // when the record already holds the expected number.
                if (fieldIndex == fields)
                    throw new WrongFieldsException();
                c = in.read();
                continue;
            }

            // The record ended, either with CR (which must be followed by LF)
            // or with the end of the stream.
            if (terminator == CR && in.read() != LF)
                throw new RFC4180Violation("missing LF afther CR");
            if (fieldIndex != fields)
                throw new WrongFieldsException();
            ++record;
            return values;
        }
    }

    /**
     * Collects the text of an escaped (quoted) field. The opening quote must
     * already have been consumed.
     *
     * @param buffer receives the field text with doubled quotes collapsed.
     * @return the character that ended the field: {@code COMMA}, {@code CR},
     *         or {@code EOF} when the stream ended right after the closing quote.
     * @throws RFC4180Violation on an illegal character, or when the stream
     *         ends before the closing quote.
     */
    private int readEscaped(StringBuilder buffer) throws IOException, RFC4180Violation {
        while (true) {
            int c = in.read();
            if (c < 0) {
                // Report a truncated field as such instead of formatting the
                // end-of-stream marker as a character code.
                throw new RFC4180Violation("Unexpected end of stream in an escaped field.");
            }
            if (c == DQUOTE) {
                c = in.read();
                if (c == COMMA || c == CR)
                    return c;
                if (c < 0)
                    return EOF;
                // Only a second quote may follow; it stands for one literal quote.
                if (c != DQUOTE)
                    throw illegalCharacter(c, "after a escaped field");
            } else if (c < 0x20 && c != CR && c != LF) {
                throw illegalCharacter(c, "in a escaped field");
            }
            buffer.append((char) c);
        }
    }

    /**
     * Collects the text of a non-escaped field.
     *
     * @param buffer receives the field text.
     * @param c the first character of the field, already read from the stream;
     *        may itself be a terminator or the end-of-stream marker.
     * @return the character that ended the field: {@code COMMA}, {@code CR},
     *         or {@code EOF}.
     * @throws RFC4180Violation on a control character other than CR, or on a
     *         double quote.
     */
    private int readNonEscaped(StringBuilder buffer, int c) throws IOException, RFC4180Violation {
        while (true) {
            if (c == COMMA || c == CR)
                return c;
            if (c < 0)
                return EOF;
            if (c < 0x20 || c == DQUOTE)
                throw illegalCharacter(c, "in a non-escaped field");
            buffer.append((char) c);
            c = in.read();
        }
    }

    /**
     * Builds the violation for an unexpected character.
     *
     * @param illegalCharacter the offending character code.
     * @param location a phrase describing where the character was found.
     * @return the exception to throw.
     */
    private static RFC4180Violation illegalCharacter(int illegalCharacter, String location) {
        String message = "Illegal character 0x"
                + Integer.toHexString(illegalCharacter).toUpperCase()
                + ' ' + location + '.';
        return new RFC4180Violation(message);
    }

    private static final int EOF = -1;
    private static final int LF = '\n';
    private static final int CR = '\r';
    private static final int DQUOTE = '"';
    private static final int COMMA = ',';

    /** The character source. */
    private final Reader in;

    /** Initial capacity of the field text buffer. */
    private final int bufferSize;

    /** Number of records returned so far. */
    private long record = 0L;
}