我需要在C++中加载并使用CSV文件的数据。 目前它实际上只需要是一个按逗号分隔的解析器(即不必考虑换行符和逗号的转义)。 主要需求是一个逐行解析器:每次调用某个方法时,它返回下一行对应的向量。
我发现这个示例看起来很有希望:http://www.boost.org/doc/libs/1_35_0/libs/spirit/example/fundamental/list_parser.cpp
我从未使用过Boost的Spirit库,但愿意尝试一下。 不过前提是没有更直接的解决方案。
如果您不关心逗号和换行符的转义,
并且不需要在引号中嵌入逗号和换行符(既然不能转义,那么……)
那么只需要大约三行代码(好吧,其实是14行,不过读取整个文件也只要15行)。
/**
 * Reads the next line from `str` and splits it into comma-separated fields.
 *
 * No quoting or escaping is understood: every ',' is a field separator.
 * A line with N commas always yields N + 1 fields, so a trailing comma
 * produces a final empty field and an empty line (or a failed read)
 * produces a single empty field.
 *
 * @param str  Stream to read one line from.
 * @return     The fields of that line, in order.
 */
std::vector<std::string> getNextLineAndSplitIntoTokens(std::istream& str)
{
    std::string line;
    std::getline(str, line);

    std::vector<std::string> result;
    std::string::size_type fieldStart = 0;
    for (;;)
    {
        const std::string::size_type comma = line.find(',', fieldStart);
        if (comma == std::string::npos)
        {
            // Whatever follows the last comma (possibly nothing) is the final field.
            result.push_back(line.substr(fieldStart));
            break;
        }
        result.push_back(line.substr(fieldStart, comma - fieldStart));
        fieldStart = comma + 1;
    }
    return result;
}
我会直接创建一个表示一行数据的类,
然后把流读入该对象:
#include <cstddef>
#include <fstream>
#include <iostream>
#include <iterator>
#include <sstream>
#include <string>
#include <vector>
/**
 * One row of a simple comma-separated file.
 *
 * Fields are split on every ',' with no support for quoting or escaping.
 */
class CSVRow
{
    public:
        /// Returns the field at `index`. The index is not range-checked.
        std::string const& operator[](std::size_t index) const
        {
            return m_data[index];
        }

        /// Number of fields read by the most recent readNextRow().
        std::size_t size() const
        {
            return m_data.size();
        }

        /**
         * Replaces the stored fields with those of the next line of `str`.
         *
         * A line with N commas yields N + 1 fields, so a trailing comma gives
         * a final empty field and an empty line (or a failed read) gives one
         * empty field.
         */
        void readNextRow(std::istream& str)
        {
            std::string line;
            std::getline(str, line);

            m_data.clear();
            std::string current;
            for (std::string::const_iterator it = line.begin(); it != line.end(); ++it)
            {
                if (*it == ',')
                {
                    // Separator: the field collected so far is complete.
                    m_data.push_back(current);
                    current.clear();
                }
                else
                {
                    current.push_back(*it);
                }
            }
            // The text after the last separator is always a field, even if empty.
            m_data.push_back(current);
        }

    private:
        std::vector<std::string> m_data;
};
/**
 * Stream extraction for CSVRow: loads the next line of `str` into `data`
 * through CSVRow::readNextRow and returns the stream, so the call can be
 * chained or used directly as a loop condition.
 */
std::istream& operator>>(std::istream& str, CSVRow& data)
{
    data.readNextRow(str);
    return str;
}
/**
 * Prints the 4th field of every row in "plop.csv".
 *
 * Rows with fewer than four fields are reported on stderr instead of being
 * indexed, because CSVRow::operator[] performs no range check.
 *
 * @return 0 on success, 1 if the file could not be opened.
 */
int main()
{
    std::ifstream file("plop.csv");
    if (!file)
    {
        std::cerr << "Unable to open plop.csv\n";
        return 1;
    }

    CSVRow row;
    while(file >> row)
    {
        if (row.size() > 3)
        {
            std::cout << "4th Element(" << row[3] << ")\n";
        }
        else
        {
            // Indexing row[3] here would read past the end of the row.
            std::cerr << "Skipping row with only " << row.size() << " field(s)\n";
        }
    }
    return 0;
}
但只需做一点工作,我们就可以从技术上创建一个迭代器:
/**
 * Input iterator that yields one CSVRow per line of an input stream.
 *
 * Constructing from a stream reads the first row immediately. Once a read
 * fails, the iterator drops its stream pointer and compares equal to a
 * default-constructed (end) iterator. The stream is held by pointer, so it
 * must outlive every iterator created from it.
 *
 * The member typedefs match the operators below: dereferencing yields a
 * const row, and difference_type is signed as iterator_traits users expect.
 */
class CSVIterator
{
    public:
        typedef std::input_iterator_tag     iterator_category;
        typedef CSVRow                      value_type;
        typedef std::ptrdiff_t              difference_type;
        typedef CSVRow const*               pointer;
        typedef CSVRow const&               reference;

        /// Begin iterator: attaches to `str` (if it is good) and reads the first row.
        CSVIterator(std::istream& str)  :m_str(str.good()?&str:NULL) { ++(*this); }
        /// End iterator: not attached to any stream.
        CSVIterator()                   :m_str(NULL) {}

        /// Pre-increment: reads the next row; becomes the end iterator when the read fails.
        CSVIterator& operator++()
        {
            if (m_str)
            {
                if (!((*m_str) >> m_row))
                {
                    m_str = NULL;
                }
            }
            return *this;
        }

        /// Post-increment: returns a copy holding the current row, then advances.
        CSVIterator operator++(int)
        {
            CSVIterator tmp(*this);
            ++(*this);
            return tmp;
        }

        CSVRow const& operator*()   const   {return m_row;}
        CSVRow const* operator->()  const   {return &m_row;}

        /// Equal when both are the same object or both are end iterators.
        /// Const-qualified so that const iterators can be compared as well.
        bool operator==(CSVIterator const& rhs) const
        {
            return ((this == &rhs) || ((this->m_str == NULL) && (rhs.m_str == NULL)));
        }
        bool operator!=(CSVIterator const& rhs) const
        {
            return !((*this) == rhs);
        }

    private:
        std::istream*       m_str;  // NULL once the stream is exhausted (end state)
        CSVRow              m_row;  // most recently read row
};
/**
 * Prints the 4th field of every row in "plop.csv", iterating with CSVIterator.
 *
 * Rows with fewer than four fields are reported on stderr instead of being
 * indexed, because CSVRow::operator[] performs no range check.
 *
 * @return 0 on success, 1 if the file could not be opened.
 */
int main()
{
    std::ifstream file("plop.csv");
    if (!file)
    {
        std::cerr << "Unable to open plop.csv\n";
        return 1;
    }

    for(CSVIterator loop(file); loop != CSVIterator(); ++loop)
    {
        if (loop->size() > 3)
        {
            std::cout << "4th Element(" << (*loop)[3] << ")\n";
        }
        else
        {
            // Indexing (*loop)[3] here would read past the end of the row.
            std::cerr << "Skipping row with only " << loop->size() << " field(s)\n";
        }
    }
    return 0;
}
使用Boost标记器的解决方案:
// Split `line` into CSV fields with Boost.Tokenizer and collect them in `vec`.
// NOTE(review): `line` is not declared in this fragment; presumably it is a
// std::string holding one CSV record -- confirm against the surrounding code.
std::vector<std::string> vec;
using namespace boost;
// Per the Boost.Tokenizer documentation, escaped_list_separator takes the
// escape character, the field separator and the quote character, in that order.
tokenizer<escaped_list_separator<char> > tk(
line, escaped_list_separator<char>('\\', ',', '\"'));
// Append every token produced by the tokenizer to vec, in order.
for (tokenizer<escaped_list_separator<char> >::iterator i(tk.begin());
i!=tk.end();++i)
{
vec.push_back(*i);
}
我的版本只使用标准的C++11库。 它能很好地处理Excel风格CSV中的引号:
spam eggs,"foo,bar","""fizz buzz"""
1.23,4.567,-8.00E+09
代码被写成一个有限状态机,一次消耗一个字符。 我觉得这样更容易推理。
#include <istream>
#include <string>
#include <vector>
/// Parser states used by readCSVRow.
enum class CSVState {
    UnquotedField,  ///< Outside quotes: ',' ends the field, '"' opens a quoted section.
    QuotedField,    ///< Inside quotes: every character is literal until the next '"'.
    QuotedQuote     ///< Saw '"' inside quotes: either the first half of "" or the closing quote.
};

/// Split one CSV record (Excel dialect) into its fields.
///
/// Handles quoted fields, commas inside quotes and doubled quotes ("" -> ").
/// The result always has at least one field; an empty row yields one empty field.
///
/// Two inputs are treated more leniently than a strict character-by-character
/// reading would:
///  - A single trailing '\r' is ignored, so rows taken with std::getline from
///    a CRLF file do not carry the '\r' into the last field.
///  - Text that directly follows a closing quote ("ab"cd) is kept as part of
///    the field (abcd) instead of having its first character dropped.
///
/// @param row  One record, without its line terminator.
/// @return     The decoded fields, in order.
std::vector<std::string> readCSVRow(const std::string &row) {
    CSVState state = CSVState::UnquotedField;
    std::vector<std::string> fields {""};

    // Ignore the carriage return left behind when a CRLF line is read with getline.
    std::string::size_type length = row.size();
    if (length > 0 && row[length - 1] == '\r') {
        --length;
    }

    for (std::string::size_type pos = 0; pos < length; ++pos) {
        const char c = row[pos];
        switch (state) {
            case CSVState::UnquotedField:
                if (c == ',') {
                    fields.emplace_back();              // end of field
                } else if (c == '"') {
                    state = CSVState::QuotedField;      // opening quote
                } else {
                    fields.back().push_back(c);
                }
                break;

            case CSVState::QuotedField:
                if (c == '"') {
                    state = CSVState::QuotedQuote;      // closing quote or first half of ""
                } else {
                    fields.back().push_back(c);
                }
                break;

            case CSVState::QuotedQuote:
                if (c == ',') {
                    fields.emplace_back();              // ',' right after the closing quote
                    state = CSVState::UnquotedField;
                } else if (c == '"') {
                    fields.back().push_back('"');       // "" -> "
                    state = CSVState::QuotedField;
                } else {
                    // The quote closed the quoted section; keep the character
                    // that follows it rather than silently discarding it.
                    fields.back().push_back(c);
                    state = CSVState::UnquotedField;
                }
                break;
        }
    }
    return fields;
}
/// Read a whole CSV stream (Excel dialect) into a table of rows.
///
/// Each line of `in` is handed to readCSVRow, so quoted fields with doubled
/// quotes ("quoted fields ""with quotes""") are accepted. Reading stops at
/// end of input or at the first line that cannot be read.
///
/// @param in  Stream to read from.
/// @return    One vector of fields per line read, in file order.
std::vector<std::vector<std::string>> readCSV(std::istream &in) {
    std::vector<std::vector<std::string>> table;
    std::string line;
    // Check eof before each read, as the original loop did, so the stream's
    // state flags after the last line are left exactly as before.
    while (!in.eof() && std::getline(in, line)) {
        table.push_back(readCSVRow(line));
    }
    return table;
}