Difference between revisions of "CSV"
(Created page with "==Comma separated values (CSV)== This is a standard record-based file structure for containing textual data. Each value is separated by a comma, and the end of record is deno...") |
|||
| Line 1: | Line 1: | ||
| + | [[Category:Java]] | ||
| + | [[Category:Example code]] | ||
==Comma separated values (CSV)== | ==Comma separated values (CSV)== | ||
Latest revision as of 06:53, 12 April 2016
Contents
Comma separated values (CSV)
This is a standard record-based file structure for containing textual data. Each value is separated by a comma, and the end of record is denoted by the end of the line. More information can be found on the wikipedia page.
One of the widely adopted standards is described in RFC-4180.
Parsing CSV in Linux
There is a nifty tool called csvtool, that can be installed on Ubuntu using the following command:
sudo apt-get install csvtool
This example will extract the first and third columns from a CSV file:
csvtool col "1,3" example.csv
You could also use pipes to make csvtool read its input from stdin (by supplying "-" as the filename):
cat example.csv | csvtool col "1,3" -
There is much more that the tool can do. See the help information:
csvtool --help
Java source code
CSVReader
This class allows you to read a CSV source properly. You should use a BufferedReader where possible to speed things up.
package com.metapack.common.text.csv;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
/**
* CSVReader
*
* This class will read a CSV file from the provided reader using the RFC-4180 standard.
*
* Capabilities in a nut shell:
*
* 1) Handles quoted values
* 2) Handles multi-line quoted values
* 3) Handles double-quote escaping through doubles.
* e.g.
* Paul said, "Hi".
* would be encoded as:
* "Paul said, ""Hi""."
*
*
* See: http://tools.ietf.org/html/rfc4180
*
* @author apowney
*
*/
public class CSVReader {
private Reader mReader;
private static int RECORD_TERMINATOR = '\n';
private static int FIELD_DELIMITER = ',';
private boolean mTrimWhitespace = true;
/**
* Create a CSVReader around the provided reader. It is recommended that a BufferedReader be used here.
*
* @param reader The reader where the source input is coming from.
*/
public CSVReader(Reader reader) {
mReader = reader;
}
/**
* This provides access to the base reader used during initialisation of this object. Manipulation of the internal reader, though not recommended, is supported.
*
* @return The reader where the input source is coming from.
*/
public Reader getInternalReader() {
return mReader;
}
/**
* Read a record (usually a line) from the input, returning an array of each value. After the last record has been read, this will return null.
*
* @return An array containing the values, or null if there are no more records to read.
* @throws IOException if an error occurs.
*/
public List<String> readRecord() throws IOException {
ArrayList<String> values = new ArrayList<String>();
int ch = mReader.read();
while( isWhitespace( ch ) ) {
ch = mReader.read();
}
if( ch == -1 ) {
// We have read the last record, so we will now return null.
return null;
}
while( ch != -1 ) {
// Skip any whitespace found here
while( isWhitespace( ch ) ) {
ch = mReader.read();
}
if( ch == RECORD_TERMINATOR ) {
// Genuine record-terminator reached
if( values.size() == 0 ) {
// This is the special case where we handle a blank line. There is a single value, and it's blank. An empty array is wrong.
values.add( "" );
}
break;
} else if( ch == '\"' ) {
// Read the quote-terminated string (double the quotes to escape them)
StringBuilder sb = new StringBuilder();
int termChar = ch;
int lastChar = 0;
lastChar = mReader.read();
ch = mReader.read();
while( ch != -1 ) {
if( lastChar == termChar && ch == termChar ) {
sb.appendCodePoint( termChar );
ch = mReader.read();
} else if( lastChar == termChar ) {
break;
} else {
sb.appendCodePoint( lastChar );
}
lastChar = ch;
ch = mReader.read();
}
if( ch == termChar ) {
ch = mReader.read();
} else if( ch == -1 ){
// We reach here if the document has finished but the quoted text is still open (but we need to add the last character). i.e. String not terminated
if(lastChar != termChar)
sb.appendCodePoint( lastChar );
}
// Skip any trailing whitespace
while( isWhitespace( ch ) ) {
ch = mReader.read();
}
// Skip the field delimiter if there is one
if( ch == FIELD_DELIMITER ) {
ch = mReader.read();
}
values.add( sb.toString() );
} else {
// Read a normal value
StringBuilder sb = new StringBuilder();
while( ch != -1 && ch != RECORD_TERMINATOR && ch != FIELD_DELIMITER ) {
sb.appendCodePoint( ch );
ch = mReader.read();
}
if( isTrimWhitespace() ) {
values.add( sb.toString().trim() );
} else {
values.add( sb.toString() );
}
// Skip the field delimiter if there is one
if( ch == FIELD_DELIMITER ) {
ch = mReader.read();
}
}
}
return values;
}
/**
* For normal values (those not inside quotes), it is often necessary to omit leading and trailing whitespace. By default, this option is turned on.
*
* @return True if leading and trailing whitespace should be trimmed.
*/
public boolean isTrimWhitespace() {
return mTrimWhitespace;
}
/**
* For normal values (those not inside quotes), it is often necessary to omit leading and trailing whitespace. By default, this option is turned on.
*
* @param trimWhitespace True if leading and trailing whitespace should be trimmed. Default is "true".
*/
public void setTrimWhitespace( boolean trimWhitespace ) {
mTrimWhitespace = trimWhitespace;
}
/**
* Override this method if you want to treat certain characters as whitespace. The default implementation checks space, tab, and carriage-return. Line-feed
* is the record terminator.
*
* @param ch The character to test.
* @return True if the character is whitespace, false otherwise.
*/
protected boolean isWhitespace(int ch) {
return ch == ' ' || ch == '\t' || ch == '\r';
}
}
CSVWriter
This class handles the writing of records to a CSV file, ensuring values are properly escaped. Consider using a BufferedWriter to ensure things are speedy.
package com.metapack.common.text.csv;
import java.io.IOException;
import java.io.Writer;
/**
* CSVWriter
*
* This class will write a CSV file to the provided writer using the RFC-4180 standard.
*
* Remember: After writing the file, you should perform a flush() on your writer.
*
* @author apowney
*
*/
public class CSVWriter {
private Writer mWriter;
private boolean mFirstValue = true;
/**
* Create a new CSVWriter to the designated target writer.
*
* @param writer
*/
public CSVWriter(Writer writer) {
mWriter = writer;
}
/**
* This provides access to the base writer used during initialisation of this object. Manipulation of the internal writer, though not recommended, is supported.
*
* @return The writer where the output source is going to.
*/
public Writer getInternalWriter() {
return mWriter;
}
/**
* Start a new record. You cannot use this to create empty records (you will need to add an empty value).
*
* @throws IOException if an error occurs.
*/
public void newRecord() throws IOException {
if( ! mFirstValue ) {
mWriter.write( getRecordTerminator() );
mFirstValue = true;
}
}
/**
* Writes an object to the CSV file. This makes use of the toString() of the object to work out its representation. Null values are treated as empty strings. You can
* ensure that all values are quoted (other than null ones) if you wish. However, this will attempt to automatically work out if terminator quotes are required.
*
* @param value The value (or null) to write.
* @param alwaysQuote Put double-quotes around the value.
* @throws IOException if an error occurs.
*/
public void write(Object value,boolean alwaysQuote) throws IOException {
if( ! mFirstValue ) {
mWriter.write( "," );
}
if( value == null ) {
mWriter.write( "" );
} else {
String csvValue = value.toString();
boolean quotesNeeded = alwaysQuote;
if( ! quotesNeeded && csvValue.indexOf( ',' ) != -1 ) {
quotesNeeded = true;
}
if( ! quotesNeeded && csvValue.indexOf( '\"' ) != -1 ) {
quotesNeeded = true;
}
if( ! quotesNeeded && csvValue.indexOf( '\'' ) != -1 ) {
quotesNeeded = true;
}
if( ! quotesNeeded && csvValue.indexOf( '\n' ) != -1 ) {
quotesNeeded = true;
}
if( quotesNeeded ) {
mWriter.write( "\"" );
}
mWriter.write( csvValue.replace( "\"" , "\"\"" ) );
if( quotesNeeded ) {
mWriter.write( "\"" );
}
}
mFirstValue = false;
}
/**
* Override this method if you want to put a record terminator other than CR LF.
*
* @return The string to use as a record terminator.
*/
protected String getRecordTerminator() {
return "\r\n";
}
}
Test application
This class shows how to use the CSVReader and CSVWriter classes in an application. It also serves to prove the functionality for testing purposes.
package com.metapack.common.text.csv;
import java.io.FileWriter;
import java.io.IOException;
import java.io.StringReader;
import java.util.List;
public class Test {
public static void testReader() {
String testString = "This,is a,test to,\"see if it's\", working , \"paul said, \"\"he\r\nllo\"\" to John.\"\r\n1,2,3\nA,B,C\n123\"456\n1,,2,3\n\"\",X\n,1,\"2\",A,\"not terminated";
CSVReader csvReader = new CSVReader( new StringReader( testString ) );
List<String> values = null;
try {
int row = 0;
while( ( values = csvReader.readRecord() ) != null ) {
row++;
int column = 0;
for( String value : values ) {
column++;
System.out.println( "R" + row + "C" + column + ": " + value );
}
}
} catch( IOException e ) {
e.printStackTrace();
}
}
public static void testWriter() {
try {
CSVWriter csvWriter = new CSVWriter( new FileWriter( "c:/test.csv" ) );
csvWriter.newRecord();
csvWriter.write( "Text" , false );
csvWriter.write( "Text" , true );
csvWriter.newRecord();
csvWriter.write( "Te,xt" , false );
csvWriter.write( "Te,xt" , true );
csvWriter.newRecord();
csvWriter.write( "Te\"xt" , false );
csvWriter.write( "Te\"xt" , false );
csvWriter.newRecord();
csvWriter.write( "Te\nxt" , false );
csvWriter.write( "Te\nxt" , false );
csvWriter.newRecord();
csvWriter.newRecord();
csvWriter.write( "" , false );
csvWriter.newRecord();
csvWriter.getInternalWriter().flush();
} catch( IOException e ) {
e.printStackTrace();
}
}
public static void main(String[] args) {
testReader();
}
}