Thursday, April 2, 2009

Import Excel sheet to Data Table

using System;
using System.ComponentModel;
using System.Data;
using System.Data.OleDb;
using System.Text;


public class ExcelBase : Component, IDisposable
{
#region Constructors
public ExcelBase()
{
UseFinalizer = false;
}
public ExcelBase(string WorkBook)
: this()
{
this.WorkBook = WorkBook;
}
#endregion
#region Workbook/range settings
string workbook;
/// &ltsummary>
/// The workbook (file) name to query
///
[DefaultValue(null)]
public string WorkBook
{
get { return workbook; }
set
{
CloseConnection();
workbook = value;
determinedrange = null;
}
}
/// &ltsummary>
/// The Range which to query. This can be any Excel range (eg &quotA1:B5") or
/// just a worksheet name.
/// If this value is null, the first sheet of the &ltsee cref=&quotWorkBook"/> is used
///
private string range;
[DefaultValue(null)]
public string Range
{
get { return range; }
set
{
range = value;
determinedrange = null;
}
}
private int worksheetindex = 0;
/// &ltsummary>
/// The 0 based INDEX of the worksheet to query.
/// If you want to set the name of the worksheet, use &ltsee cref=&quotRange"/> instead.
/// NB: if &ltsee cref=&quotRange"/> is set, this property is ignored
///
[DefaultValue(0)]
public int WorkSheetIndex
{
get { return worksheetindex; }
set
{
worksheetindex = value;
determinedrange = null;
}
}
#region Range formatting
/// &ltsummary>
/// If a range was determined in a previous step, keep it buffered here
///
string determinedrange;
/// &ltsummary>
/// Gets the properly formatted sheet name
/// if no worksheet was provided, read out sheet information and select everything
/// from the first sheet
///
public string GetRange()
{
if (determinedrange == null)
{
string range = Range;
if (range == null)
range = DetermineRange();
if (range.IndexOf(':') == -1 && !range.EndsWith("$"))
range += "$"; //sheetname has to be appended with a $
determinedrange = "[" + range + "]";
}
return determinedrange;
}

/// &ltsummary>
/// See &ltsee cref=&quotAutoDetermineRange"/> property for more info
///
/// &ltreturns>
string DetermineRange()
{
string sheet = GetSheetName(worksheetindex);
if (!autodeterminerange) return sheet;
return new RangeFinder(this, sheet).ToString();
}
#region RangeFinder
class RangeFinder
{
OleDbDataAdapter da;
DataTable dtSchema;
ExcelDataRange rng = new ExcelDataRange();
Import eb;
int cols;
/// &ltsummary>
/// minimum amount of columns that need to be filled
/// &ltseealso cref=&quotminfilled"/>
///
int min;
public RangeFinder(ExcelBase Owner, string sheet)
{
this.eb = new Import(Owner.WorkBook);
eb.Range = sheet;
eb.UseHeaders = false;
eb.InterMixedAsText = true;
//DataTable dt = eb.Query();
try
{
eb.OpenConnection();
//get the number of rows and columns
da = new OleDbDataAdapter("select * from [" + sheet + "]", eb.Connection);
dtSchema = new DataTable();
da.FillSchema(dtSchema, SchemaType.Source);
cols = dtSchema.Columns.Count;
int rows = (int)ExecuteScalar("select count(*) from [" + sheet + "]");
//fill the range object
rng.From.Row = rng.From.Column = 1;
rng.To.Row = rows;
rng.To.Column = cols;

min = (int)(cols * minfilled);
//now rng contains the complete square range of data containing cells
//try to narrow it by getting as much hits as possible
DecreaseRange();
}
finally
{
indexReader.Close();
eb.CloseConnection();
}
}
object ExecuteScalar(string sql)
{
return new OleDbCommand(sql, da.SelectCommand.Connection).ExecuteScalar();
}

string indexquery;
string GetIndexQuery()
{
if (indexquery == null)
{
StringBuilder sql = new StringBuilder("select 0");
int i = 0;
foreach (DataRow dr in dtSchema.Rows)
{
string colname = "[" + dr["column_name"].ToString() + "]";
sql.Append("+iif(").Append(colname).Append(" is null,0,1)");
}
sql.Append(" as ind from ");
indexquery = sql.ToString();
}
return indexquery;
}
//ExcelDataRange indexRange;
DataTable indexTable = new DataTable();
OleDbDataReader indexReader;
int GetIndex()
{
if (!Forward)
{
indexReader.Close();
indexReader = null;
da.SelectCommand.CommandText = string.Format(" select * from {0}:{0}"
, rng.To.Row);
}
if (indexReader == null)
indexReader = da.SelectCommand.ExecuteReader();
int cnt = 0;
if (!indexReader.Read()) return -1;
for (int i = 0; i < indexReader.FieldCount; i++)
{
if (!indexReader.IsDBNull(i)) cnt++;
}
return cnt;
da.TableMappings.Clear();


da = new OleDbDataAdapter(da.SelectCommand.CommandText, eb.conn);
indexTable = new DataTable();
//da.FillSchema(indexTable, SchemaType.Source);
da.Fill(indexTable);
return indexTable.Columns.Count;
}
/// &ltsummary>
/// minimum percentage that needs to be filled to count as a datarow
///
const double minfilled = .75;
/// &ltsummary>
/// The amount of subsequent (or preceding) rows that need to be filled a &ltsee cref=&quotminfilled"/> percentage
/// for it to count as a datarow
///
const int CheckRows = 3;
/// &ltsummary>
/// Decrease the range step by step
/// The problem is that when obtaining all, a lot more nulls are returned
/// than you would visibly see. That makes most algorithms to get the
/// block useless.
/// this is also why just obtaining the datatable complete and removing the
/// rows will not suffice: the proper field data types will not have been set
/// Best way I could figure without using interop was to increase the start
/// range to see if the avarage filled values increase.
///
void DecreaseRange()
{
for (; ; )
{
if (GetIndex() >= min)
{
int i = 0;
for (; i < CheckRows; i++)
{
AlterRange(1);
if (GetIndex() < min)
{
break;
}
}
if (i == CheckRows)
{
AlterRange(-i);
if (Forward)
Forward = false;
else
break;
}
}
if (rng.From.Row > rng.To.Row)
throw new Exception("Could not determine data range");
AlterRange(1);
}
}
bool Forward = true;
void AlterRange(int i)
{
if (Forward)
rng.From.Row += i;
else
rng.To.Row -= i;
}

public override string ToString()
{
return rng.ToString();
}
struct ExcelRange
{
public int Row, Column;
public ExcelRange(int Col, int Row)
{
this.Column = Col;
this.Row = Row;
}
public override string ToString()
{
//return string.Format(&quotR{0}C{1}", Row, Column);
string res = Row.ToString();
int col = Column;
while (col > 0)
{
int cc = col % 26;
char c = (char)('A' + cc - 1);
res = c.ToString() + res;
col /= 26;
}
return res;
}
}
struct ExcelDataRange
{
public ExcelRange
From, To;
public override string ToString()
{
return GetRange(From, To);
}
static string GetRange(ExcelRange from, ExcelRange to)
{
return from.ToString() + ":" + to.ToString();
}
public string TopRow()
{
return GetRange(From, new ExcelRange(To.Column, From.Row));
}
public string BottomRow()
{
return GetRange(new ExcelRange(From.Column, To.Row), To);
}
}
}
#endregion
#endregion

/// &ltsummary>
/// Checks if the &ltsee cref=&quotWorkBook"/> exists
///
public bool WorkBookExists
{
get { return System.IO.File.Exists(WorkBook); }
}
/// &ltsummary>
/// Checks if the workbook exists and throws an exception if it doesn't
/// &ltseealso cref=&quotWorkBookExists"/>
///
protected void CheckWorkbook()
{
if (!WorkBookExists) throw new System.IO.FileNotFoundException("Workbook not found", WorkBook);
}
#endregion
#region Connection
/// &ltsummary>
/// Creates &nbspa NEW connection. If this method is called directly, this
/// class will not check if it is closed.
/// To get a handled connection, use the &ltsee cref=&quotConnection"/> property.
///
/// &ltreturns>
public OleDbConnection CreateConnection()
{
CheckWorkbook();
return new OleDbConnection(
string.Format("Provider=Microsoft.Jet.OLEDB.4.0;" +
"Data Source={0};Extended Properties='Excel 8.0;HDR={1};Imex={2}'",
WorkBook, useheaders ? "Yes" : "No",
imex ? "1" : "0")
);
}
private bool useheaders = true;
/// &ltsummary>
/// Determines if the first row in the specified &ltsee cref=&quotRange"/> contains the headers
///
[DefaultValue(true)]
public bool UseHeaders
{
get { return useheaders; }
set
{
if (useheaders != value)
{
CloseConnection();
useheaders = value;
}
}
}
private bool imex;
/// &ltsummary>
/// if this value is &ltc>true, 'intermixed' data columns are handled as text (otherwise Excel tries to make a calcuated guess on what the datatype should be)
///
[DefaultValue(false)]
public bool InterMixedAsText
{
get { return imex; }
set
{
if (imex != value)
{
CloseConnection();
imex = value;
}
}
}
private bool autodeterminerange;
/// &ltsummary>
/// Tries to obtain the range automatically by looking for a large chunk of data. Use this value if there's a lot of
/// static around the actual data.
/// Beware though: this takes some additional steps and can cause performance loss
/// when querying larger files.
/// automatically determening the range is not fullproof. Be sure to check the results
/// on first time use.
/// NB: if the &ltsee cref=&quotRange"/> is set, this property is ignored.
///
[DefaultValue(false)]
public bool AutoDetermineRange
{
get { return autodeterminerange; }
set
{
if (autodeterminerange != value)
{
autodeterminerange = value;
determinedrange = null;
}
}
}
OleDbConnection conn;
/// &ltsummary>
/// Gets a connection to the current &ltsee cref=&quotWorkBook"/>
/// When called for the first time (or after changing the workbook)
/// a new connection is created.
/// To close the connection, preferred is the use of &ltsee cref=&quotCloseConnection"/>
///
public OleDbConnection Connection
{
get
{
if (conn == null)
{
conn = CreateConnection();
UseFinalizer = true;
}
return conn;
}
}
/// &ltsummary>
/// Closes the connection (if open)
///
public void CloseConnection()
{
if (conn != null && ConnectionIsOpen)
conn.Dispose();
conn = null;
UseFinalizer = false;
}
protected void CloseConnection(bool OnlyIfNoneOpen)
{
if (OnlyIfNoneOpen)
{
if (--opencount > 0 || wasopenbeforerememberstate) return;
}
CloseConnection();
}
/// &ltsummary>
/// Opens the &ltsee cref=&quotConnection"/>
///
public void OpenConnection()
{
OpenConnection(false);
}
int opencount;
bool wasopenbeforerememberstate;
protected void OpenConnection(bool RememberState)
{
if (RememberState && opencount++ == 0) wasopenbeforerememberstate = ConnectionIsOpen;
if (!ConnectionIsOpen)
Connection.Open();
}
public bool ConnectionIsOpen
{
get { return conn != null && conn.State != ConnectionState.Closed; }
}

#endregion
#region IDisposable Members
public void Dispose()
{
CloseConnection();
}
~ExcelBase()
{
Dispose();
}
private bool usefinalizer;
bool UseFinalizer
{
get { return usefinalizer; }
set
{
if (usefinalizer == value) return;
usefinalizer = value;
if (value)
GC.ReRegisterForFinalize(this);
else
GC.SuppressFinalize(this);
}
}
#endregion
#region Helper functions
/// &ltsummary>
/// queries the connection for the sheetnames and returns them
///
/// &ltreturns>
public string[] GetSheetNames()
{
OpenConnection(true);
try
{
// Read out sheet information
DataTable dt = Connection.GetOleDbSchemaTable(OleDbSchemaGuid.Tables, null);
if (dt == null || dt.Rows.Count == 0)
{
throw new Exception("Could not get sheet names");
}

string[] res = new string[dt.Rows.Count];
for (int i = 0; i < res.Length; i++)
{
string name = dt.Rows[i]["TABLE_NAME"].ToString();

if (name[0] == '\'')
{
//numeric sheetnames get single quotes around them in the schema.
//remove them here
if (System.Text.RegularExpressions.Regex.IsMatch(
name, @"^'\d\w+\$'$"))
name = name.Substring(1, name.Length - 2);
}
res[i] = name;
}
return res;
}
finally
{
CloseConnection(true);
}
}
/// &ltsummary>
/// Gets the name of the first sheet
/// (this is also the default range used, when no &ltsee cref=&quotRange"/> is specified)
///
/// &ltreturns>
public string GetFirstSheet()
{
return GetSheetName(0);
}
public string GetSheetName(int index)
{
string[] sheets = GetSheetNames();
if (index <>= sheets.Length)
throw new IndexOutOfRangeException("No worksheet exists at the specified index");
return sheets[index];
}
#endregion
}
public class Import : ExcelBase
{
#region Static query procedures
/// &ltsummary>
/// Imports the first worksheet of the specified file
///
/// &ltparam name=&quotFile">
public static DataTable Query(string File)
{
return Query(File, null);
}
/// &ltsummary>
/// Imports the specified sheet in the specified file
///
/// &ltparam name=&quotFile">
/// &ltparam name=&quotRange">The worksheet or excel range to query
/// &ltreturns>
public static DataTable Query(string File, string Range)
{
return new Import(File, Range).Query();
}
public static DataTable Select(string File, string Sql)
{
Import i = new Import(File);
i.SQL = Sql;
return i.Query();
}
#endregion

#region Constructors
public Import() { }
public Import(string WorkBook) : base(WorkBook) { }
public Import(string WorkBook, string Range)
: this(WorkBook)
{
this.Range = Range;
}
#endregion

#region SQL Query
private string fields = "*";
/// &ltsummary>
/// The fields which should be returned (default all fields with data: "*")
///
[DefaultValue("*")]
public string Fields
{
get { return fields; }
set { fields = value; }
}
void ResetFields()
{
fields = "*";
}
private string where;
/// &ltsummary>
/// An optional where clause. Works pretty much the same as 'normal' SQL. (Default=null)
///
[DefaultValue(null)]
public string Where
{
get { return where; }
set { where = value; }
}
/// &ltsummary>
/// The sql to perform. If this value is filled, &ltsee cref=&quotWorkSheet"/> and &ltsee cref=&quotWhere"/> are ignored
///
public string SQL;
protected string GetSelectSQL()
{
if (SQL != null) return SQL;
// if no sql was provided, construct from worksheet and where
string sql = string.Format("select {0} from {1}", fields, GetRange());
if (where != null)
sql += " WHERE " + where;
return sql;
}
/// &ltsummary>
/// Performs the query with the specifed settings
///
/// &ltreturns>
public DataTable Query()
{
return Query((DataTable)null);
}
/// &ltsummary>
/// Same as &ltsee cref=&quotQuery()"/>, but an existing datatable is used and filled
/// (it will be your own responsibility to format the datatable correctly)
///
/// &ltparam name=&quotdt">
/// &ltreturns>
public DataTable Query(DataTable dt)
{
CheckWorkbook();
try
{
OpenConnection(true);
if (dt == null)
dt = new DataTable();
new OleDbDataAdapter(GetSelectSQL(), Connection).Fill(dt);
return dt;
}
finally
{
CloseConnection(true);
}
}
/// &ltsummary>
/// Fills the datatable with the results of the query
/// (wrapper around &ltsee cref=&quotQuery(DataTable)"/>)
///
/// &ltparam name=&quotdt">
public void Fill(DataTable dt)
{
Query(dt);
}
#endregion

}

No comments:

Post a Comment