<?xml version='1.0' encoding='UTF-8'?><?xml-stylesheet href="http://www.blogger.com/styles/atom.css" type="text/css"?><feed xmlns='http://www.w3.org/2005/Atom' xmlns:openSearch='http://a9.com/-/spec/opensearchrss/1.0/' xmlns:georss='http://www.georss.org/georss' xmlns:gd='http://schemas.google.com/g/2005' xmlns:thr='http://purl.org/syndication/thread/1.0'><id>tag:blogger.com,1999:blog-1301643147176033927</id><updated>2012-02-01T10:27:02.254+01:00</updated><category term='images'/><category term='data integration'/><category term='extraction'/><category term='tools'/><category term='cli'/><category term='standardizing'/><category term='cleaner'/><category term='bug'/><category term='jndi'/><category term='create table'/><category term='development'/><category term='free'/><category term='winfried'/><category term='08'/><category term='community'/><category term='maven'/><category term='recognition'/><category term='proposal'/><category term='api'/><category term='add-on'/><category term='linkedin'/><category term='query'/><category term='motivation'/><category term='academia'/><category term='job'/><category term='heuristics'/><category term='styring'/><category term='tokens'/><category term='extension'/><category term='metaphone'/><category term='write'/><category term='fluent'/><category term='preprocessing'/><category term='2008'/><category term='table'/><category term='type parameter'/><category term='visualization'/><category term='names'/><category term='to'/><category term='java'/><category term='seam'/><category term='core'/><category term='example'/><category term='graphics'/><category term='transformation'/><category term='formatting'/><category term='hierarchy'/><category term='fetch'/><category term='similarity'/><category term='huge'/><category term='django'/><category term='mutable'/><category term='preview'/><category term='data profiling'/><category term='ui'/><category term='read'/><category term='interview'/><category term='push down'/><category 
term='report'/><category term='text'/><category term='first time right'/><category term='build'/><category term='optimization'/><category term='marketing'/><category term='large'/><category term='project'/><category term='open source data quality release announcement datacleaner 1.5.2'/><category term='blogging'/><category term='distinct'/><category term='van'/><category term='compiler'/><category term='exploration'/><category term='svn'/><category term='customer data'/><category term='flattening'/><category term='education'/><category term='thesis'/><category term='tango'/><category term='packaging'/><category term='flush'/><category term='unittest'/><category term='type-casting'/><category term='headless'/><category term='data warehousing'/><category term='openoffice'/><category term='data quality analysis'/><category term='rows'/><category term='architechture'/><category term='interface'/><category term='it'/><category term='excel'/><category term='flow'/><category term='data processing'/><category term='plugin'/><category term='metamodel'/><category term='planning'/><category term='user interface'/><category term='afløsningsopgave'/><category term='transformer'/><category term='regular'/><category term='image'/><category term='dansk'/><category term='scripts'/><category term='repository'/><category term='string analysis'/><category term='days'/><category term='manual'/><category term='promotion'/><category term='screen'/><category term='sas7bdat'/><category term='user experience'/><category term='speed'/><category term='master data management'/><category term='data format'/><category term='transaction'/><category term='pro'/><category term='engine'/><category term='xslt'/><category term='size'/><category term='meeting'/><category term='cube'/><category term='hudson'/><category term='danish'/><category term='human inference'/><category term='regex'/><category term='meta'/><category term='wikipedia'/><category term='weekdays'/><category term='sql'/><category 
term='identify'/><category term='awards'/><category term='data entry'/><category term='generated'/><category term='master'/><category term='analyzerbeans'/><category term='filtering'/><category term='data quality pro'/><category term='discussion'/><category term='standard measures'/><category term='display'/><category term='documentation'/><category term='swing'/><category term='unit'/><category term='profiler'/><category term='udvikling'/><category term='etl'/><category term='annotations'/><category term='finder'/><category term='ftr'/><category term='column'/><category term='functions'/><category term='open source'/><category term='iso date'/><category term='date'/><category term='insert'/><category term='jar'/><category term='presentation'/><category term='holland'/><category term='test'/><category term='choise'/><category term='css'/><category term='datasources'/><category term='intelligence'/><category term='ejb'/><category term='window'/><category term='humaninference'/><category term='tokenize'/><category term='performance'/><category term='group'/><category term='contact data'/><category term='notes'/><category term='multiple'/><category term='acquisition'/><category term='xml'/><category term='business'/><category term='select'/><category term='diy'/><category term='mysql'/><category term='ease of use'/><category term='lightning'/><category term='jdk'/><category term='sourceforge'/><category term='schema'/><category term='keynote'/><category term='look'/><category term='datastore'/><category term='explode'/><category term='date format'/><category term='maven2'/><category term='game'/><category term='olap'/><category term='great'/><category term='filter'/><category term='ear'/><category term='speak'/><category term='resultset'/><category term='frequency'/><category term='dqa'/><category term='book publish pentaho solutions open source business intelligence'/><category term='integration'/><category term='jpa'/><category term='explore'/><category 
term='persistence'/><category term='lgpl'/><category term='dependency'/><category term='quality'/><category term='screenshot'/><category term='result'/><category term='release'/><category term='detail'/><category term='jms'/><category term='verify'/><category term='sas'/><category term='prototype'/><category term='datasource'/><category term='value'/><category term='value distribution'/><category term='jdbc'/><category term='analyzer'/><category term='jfreechart'/><category term='board'/><category term='documents'/><category term='look and feel'/><category term='datacleaner'/><category term='fast'/><category term='benchmark'/><category term='business intelligence'/><category term='conference'/><category term='millions'/><category term='match'/><category term='evolution'/><category term='string'/><category term='n+1'/><category term='implement'/><category term='spreadsheet'/><category term='member'/><category term='python'/><category term='analysis'/><category term='javaone'/><category term='browser'/><category term='enterprise'/><category term='sassyreader'/><category term='domain'/><category term='open'/><category term='age'/><category term='csv'/><category term='rewriting'/><category term='phonetic'/><category term='file'/><category term='matching'/><category term='intranet'/><category term='database'/><category term='uispec4j'/><category term='linux'/><category term='recommendation'/><category term='hibernate'/><category term='dataprofiling'/><category term='data quality'/><category term='process'/><category term='mdx'/><category term='relational'/><category term='convert'/><category term='programming'/><category term='tutorial'/><category term='multithreading'/><category term='name'/><category term='goals'/><category term='expression'/><category term='mapping'/><category term='website'/><category term='chart'/><category term='levenshtein'/><category term='source'/><category term='expressions'/><category term='feature'/><category term='social 
construction'/><category term='soundex'/><category term='flushing'/><category term='datastores'/><category term='generics'/><category term='tool support'/><category term='dataqualitypro'/><category term='icon'/><category term='mondrian'/><category term='features'/><category term='jboss'/><category term='pattern'/><category term='eobjects'/><category term='article'/><category term='command line'/><category term='model'/><category term='data'/><category term='metadata'/><category term='drill'/><category term='profiling'/><category term='distribution'/><category term='profile'/><category term='discovery'/><title type='text'>kasper's source</title><subtitle type='html'>random thoughts, examples, tutorials and ideas on open source software, data quality, data warehousing, java programming, querying and more...</subtitle><link rel='http://schemas.google.com/g/2005#feed' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/posts/default'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default?max-results=100'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/'/><link rel='hub' href='http://pubsubhubbub.appspot.com/'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><generator version='7.00' uri='http://www.blogger.com'>Blogger</generator><openSearch:totalResults>82</openSearch:totalResults><openSearch:startIndex>1</openSearch:startIndex><openSearch:itemsPerPage>100</openSearch:itemsPerPage><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-446721981873698619</id><published>2012-01-23T21:42:00.000+01:00</published><updated>2012-01-23T21:42:22.959+01:00</updated><title type='text'>Now you can build your own DQ 
monitoring solution with DataCleaner</title><content type='html'>In the cover of night we've released a new version of &lt;a href="http://datacleaner.eobjects.org/"&gt;DataCleaner&lt;/a&gt; today (version 2.4.2). Officially it's a minor release because for the User Interface very few things have changed, only a few bugfixes and minor enhancements have been introduced. But one potentially &lt;i&gt;major feature&lt;/i&gt; have been added in the inner workings of DataCleaner: The ability to &lt;b&gt;persist the results of your DQ analysis jobs&lt;/b&gt;. Although this feature still has very limited User Interface support, it has full support in the command line interface, which I would argue is actually sufficient for the purposes of establishing a&amp;nbsp;&lt;b&gt;data quality monitoring&lt;/b&gt; solution. Later on I do expect there to be full (and backwards compatible) support in the UI as well.&lt;br /&gt;&lt;br /&gt;So what is it, and how does it work?&lt;br /&gt;Well basically it is simply two new parameters to the command line interface:&lt;br /&gt;&lt;pre&gt; -of (--output-file) FILE                          : File in which to save the result of the job&lt;br /&gt; -ot (--output-type) [TEXT | HTML | SERIALIZED]    : How to represent the result of the job&lt;/pre&gt;&lt;p&gt;Here's an example of how to use it. 
Notice that I use the file extension &lt;b&gt;.analysis.result.dat&lt;/b&gt;, which is the one thing that is currently implemented and recognized in the UI as a result file.&lt;/p&gt;&lt;pre&gt;&lt;br /&gt;&amp;gt; DataCleaner-console.exe -job examples\employees.analysis.xml\&lt;br /&gt; -ot SERIALIZED\&lt;br /&gt; -of employees.analysis.result.dat&lt;/pre&gt;&lt;p&gt;Now start up DataCleaner's UI, and select "&lt;i&gt;File -&gt; Open analysis job...&lt;/i&gt;" - you'll suddenly see that the produced file can be opened:&lt;/p&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://1.bp.blogspot.com/-bvkpvoRtC9Q/Tx3EcHDZAXI/AAAAAAAAAcg/XpuJyHsMUAM/s1600/dc.analysis.result.filechooser.png" imageanchor="1" style="margin-left:1em; margin-right:1em"&gt;&lt;img border="0" height="290" width="320" src="http://1.bp.blogspot.com/-bvkpvoRtC9Q/Tx3EcHDZAXI/AAAAAAAAAcg/XpuJyHsMUAM/s320/dc.analysis.result.filechooser.png" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;p&gt;And when you open the file, the result will be displayed just like a job you've run inside the application:&lt;/p&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://2.bp.blogspot.com/-vaRXyaiyV8M/Tx3ExHLKx7I/AAAAAAAAAcs/2b3m5a4idVw/s1600/dc.analysis.result.chart.jpg" imageanchor="1" style="margin-left:1em; margin-right:1em"&gt;&lt;img border="0" height="270" width="320" src="http://2.bp.blogspot.com/-vaRXyaiyV8M/Tx3ExHLKx7I/AAAAAAAAAcs/2b3m5a4idVw/s320/dc.analysis.result.chart.jpg" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;p&gt;Since files like this are generally easy to archive and to append e.g. timestamps to, it should be really easy to build a DIY data quality monitoring solution based on scheduled jobs and this approach to execution. Or you can get in contact with Human Inference if you want something more sophisticated ;-)&lt;/p&gt;&lt;p&gt;Notice also that there's an HTML output type, which is also quite neat and easy to parse with an XML parser. 
The SERIALIZED format is more rich though, and includes information needed for more refined, programmatic access to the results. For instance, you might deserialize the whole file using the regular Java serialization API and access it, as an &lt;a href="http://analyzerbeans.eobjects.org/apidocs/org/eobjects/analyzer/result/AnalysisResult.html" target="_blank"&gt;AnalysisResult&lt;/a&gt; instance. Thereby you could eg. create a timeline of a particular metric and track changes to the data that you are monitoring.&lt;/p&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-446721981873698619?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/446721981873698619/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=446721981873698619' title='1 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/446721981873698619'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/446721981873698619'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2012/01/now-you-can-build-your-own-dq.html' title='Now you can build your own DQ monitoring solution with DataCleaner'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/02068277922010229877</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://1.bp.blogspot.com/-bvkpvoRtC9Q/Tx3EcHDZAXI/AAAAAAAAAcg/XpuJyHsMUAM/s72-c/dc.analysis.result.filechooser.png' height='72' 
width='72'/><thr:total>1</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-5302041660818199228</id><published>2011-12-23T16:11:00.000+01:00</published><updated>2011-12-23T20:58:04.198+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='sql'/><category scheme='http://www.blogger.com/atom/ns#' term='data quality'/><category scheme='http://www.blogger.com/atom/ns#' term='etl'/><category scheme='http://www.blogger.com/atom/ns#' term='query'/><category scheme='http://www.blogger.com/atom/ns#' term='push down'/><category scheme='http://www.blogger.com/atom/ns#' term='optimization'/><category scheme='http://www.blogger.com/atom/ns#' term='database'/><category scheme='http://www.blogger.com/atom/ns#' term='jdbc'/><category scheme='http://www.blogger.com/atom/ns#' term='flow'/><title type='text'>Push down query optimization in DataCleaner</title><content type='html'>&lt;p&gt;As a follow-up to my previous post about how we make DataCleaner super-fast by applying some nice multi-threading tricks, &lt;a href="http://kasper.eobjects.org/2011/11/datacleaner-engine-explained.html"&gt;The DataCleaner engine explained&lt;/a&gt;, I would now like to touch upon another performance booster: &lt;b&gt;Push down query optimization&lt;/b&gt;.&lt;/p&gt;&lt;p&gt;To my knowledge "push down query optimization" is a trick that only very few tools support, since it requires a flow model that was actually built for it. The idea is that by inspecting an execution flow the tool might be able to identify steps in the beginning or in the end of the flow that can be replaced by query modifications.&lt;/p&gt;&lt;p&gt;For example, if your data flow begins with a filtering action that removes all records of a given type or restricts the further processing to only be the first 1000 records or something like that. 
Most tools simply require you to write some SQL yourself, which is also doable, but as I've said before on this blog, I think &lt;a href="http://kasper.eobjects.org/2011/09/data-profiling-sqlized-uh-oh.html"&gt;writing SQL is a barrier to productivity, creativity and good data quality results&lt;/a&gt;. So in DataCleaner we do not offer this option, because we have something that is &lt;i&gt;much, much nicer&lt;/i&gt;. That solution is push down query optimization!&lt;/p&gt;&lt;p&gt;Let me illustrate. I will be using the &lt;a target="_blank" href="http://dev.mysql.com/doc/sakila/en/sakila.html#sakila-installation"&gt;Sakila example database for MySQL&lt;/a&gt;:&lt;/p&gt;&lt;p&gt;Say you want to do a simple pattern finding of film titles in the Sakila database, you would select the title column and you would get a result like this:&lt;/p&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://2.bp.blogspot.com/-Kohc9pltOJ4/TvSSQVqCZsI/AAAAAAAAALU/LshXyh5FeJw/s1600/pushdown0.jpg" imageanchor="1" style="margin-left:1em; margin-right:1em"&gt;&lt;img border="0" height="222" width="320" src="http://2.bp.blogspot.com/-Kohc9pltOJ4/TvSSQVqCZsI/AAAAAAAAALU/LshXyh5FeJw/s320/pushdown0.jpg" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;p&gt;In the DataCleaner logs we can see what queries are actually fired to the database. Open up the log file (in the &lt;i&gt;logs&lt;/i&gt; folder) and inspect &lt;i&gt;datacleaner.log&lt;/i&gt;. You will find a line like this:&lt;/p&gt;&lt;blockquote&gt;Executing query: SELECT `nicer_but_slower_film_list`.`title` FROM sakila.`nicer_but_slower_film_list`&lt;/blockquote&gt;&lt;p&gt;That's fine. You can inspect the results closer, but that's not what this topic is about, so I'll carry on... Now let's say you want to refine your job. Let's instead see how the pattern distribution is if we want to only look at a few categories of films. 
So I add an 'Equals' filter to only select &lt;b&gt;horror&lt;/b&gt;, &lt;b&gt;sports&lt;/b&gt; and &lt;b&gt;action&lt;/b&gt; movies and apply it to my pattern finder:&lt;/p&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://3.bp.blogspot.com/-mJkbLj9VcVo/TvSVGExoh_I/AAAAAAAAALg/kITKrvCSS2w/s1600/pushdown1.jpg" imageanchor="1" style="margin-left:1em; margin-right:1em"&gt;&lt;img border="0" height="133" width="320" src="http://3.bp.blogspot.com/-mJkbLj9VcVo/TvSVGExoh_I/AAAAAAAAALg/kITKrvCSS2w/s320/pushdown1.jpg" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;p&gt;If we run the job, and inspect the log file again, we now see this entry:&lt;/p&gt;&lt;blockquote&gt;Executing query: SELECT `nicer_but_slower_film_list`.`title`, `nicer_but_slower_film_list`.`category` FROM sakila.`nicer_but_slower_film_list` WHERE (`nicer_but_slower_film_list`.`category` = '&lt;b&gt;Horror&lt;/b&gt;' OR `nicer_but_slower_film_list`.`category` = '&lt;b&gt;Action&lt;/b&gt;' OR `nicer_but_slower_film_list`.`category` = '&lt;b&gt;Sports&lt;/b&gt;')&lt;/blockquote&gt;&lt;p&gt;What's surprising here is that the filter actually got query optimized. Not all filters have this ability, since some of them have richer functionality than can be expressed as a query modification. 
But some of them do, and typically these are the small functions that make a big difference.&lt;/p&gt;&lt;p&gt;Let's also apply a Max rows filter that limits the analysis for only &lt;i&gt;20 records&lt;/i&gt; and chain it so that it depends on the Equals filter:&lt;/p&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://4.bp.blogspot.com/-BNIEAW3Rwd8/TvSYKa5EqDI/AAAAAAAAALs/FzhD09PrrHc/s1600/pushdown2.jpg" imageanchor="1" style="margin-left:1em; margin-right:1em"&gt;&lt;img border="0" height="133" width="320" src="http://4.bp.blogspot.com/-BNIEAW3Rwd8/TvSYKa5EqDI/AAAAAAAAALs/FzhD09PrrHc/s320/pushdown2.jpg" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;p&gt;If we now run the job, both filters will have been applied to the query:&lt;/p&gt;&lt;blockquote&gt;Executing query: SELECT `nicer_but_slower_film_list`.`title`, `nicer_but_slower_film_list`.`category` FROM sakila.`nicer_but_slower_film_list` WHERE (`nicer_but_slower_film_list`.`category` = 'Horror' OR `nicer_but_slower_film_list`.`category` = 'Action' OR `nicer_but_slower_film_list`.`category` = 'Sports') &lt;b&gt;LIMIT 20&lt;/b&gt;&lt;/blockquote&gt;&lt;p&gt;That means that we do as much as we can to optimize the query, without ever having to ask the user to help us. So if you modify the logical job, the physical queries are automatically adapted! This is why push down query optimization is a superior optimization technique to raw SQL. Happy data cleaning!&lt;/p&gt;&lt;p&gt;&lt;b&gt;Additional information for developers&lt;/b&gt;: If you're developing plugins to DataCleaner and want to make a query optimized filter, then simply make sure you implement the &lt;a target="_blank" href="http://analyzerbeans.eobjects.org/apidocs/org/eobjects/analyzer/beans/api/QueryOptimizedFilter.html"&gt;QueryOptimizedFilter&lt;/a&gt; interface! 
Happy coding!&lt;/p&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-5302041660818199228?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/5302041660818199228/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=5302041660818199228' title='2 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/5302041660818199228'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/5302041660818199228'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2011/12/push-down-query-optimization-in.html' title='Push down query optimization in DataCleaner'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/02068277922010229877</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://2.bp.blogspot.com/-Kohc9pltOJ4/TvSSQVqCZsI/AAAAAAAAALU/LshXyh5FeJw/s72-c/pushdown0.jpg' height='72' width='72'/><thr:total>2</thr:total><georss:featurename>Copenhagen, Denmark</georss:featurename><georss:point>55.6760968 12.5683371</georss:point><georss:box>55.604469300000005 12.4104086 55.7477243 12.726265600000001</georss:box></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-5482131391570646686</id><published>2011-11-29T13:35:00.001+01:00</published><updated>2011-11-29T23:04:55.235+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='performance'/><category scheme='http://www.blogger.com/atom/ns#' term='data 
quality'/><category scheme='http://www.blogger.com/atom/ns#' term='etl'/><category scheme='http://www.blogger.com/atom/ns#' term='engine'/><category scheme='http://www.blogger.com/atom/ns#' term='multithreading'/><category scheme='http://www.blogger.com/atom/ns#' term='optimization'/><category scheme='http://www.blogger.com/atom/ns#' term='datacleaner'/><category scheme='http://www.blogger.com/atom/ns#' term='data processing'/><title type='text'>The DataCleaner engine explained</title><content type='html'>&lt;img src="http://datacleaner.eobjects.org/resources/screenshots/dc_2.1_c_small.png" alt="" style="float: right; border: none;"/&gt;&lt;p&gt;For this blog entry I have decided to record a short video instead of writing until my fingers fall off :) So I present to you: My &lt;b&gt;videoblog&lt;/b&gt; entry about &lt;b&gt;DataCleaner's data quality processing engine&lt;/b&gt;, and how it compares to traditional &lt;b&gt;ETL engines&lt;/b&gt;.&lt;/p&gt;&lt;p&gt;The DataCleaner engine was created from the ground up to be optimized for Data Quality projects. It performs superiorly to any other engine that we've looked at, which I think is a pretty nice achievement. 
In the video I try to explain what makes it different!&lt;/p&gt;&lt;div style="clear:both;"&gt;&lt;/div&gt;        &lt;div&gt;            &lt;object classid="clsid:D27CDB6E-AE6D-11cf-96B8-444553540000" width="640" height="498" id="csSWF"&gt;                &lt;param name="movie" value="http://datacleaner.eobjects.org/resources/webcasts/etlightweight_controller.swf" /&gt;                &lt;param name="quality" value="best" /&gt;                &lt;param name="bgcolor" value="#1a1a1a" /&gt;                &lt;param name="allowfullscreen" value="true" /&gt;                &lt;param name="scale" value="showall" /&gt;                &lt;param name="allowscriptaccess" value="always" /&gt;                &lt;param name="flashvars" value="autostart=false&amp;thumb=http://datacleaner.eobjects.org/resources/webcasts/etlightweight_firstframe.png&amp;thumbscale=45&amp;color=0x000000,0x000000" /&gt;                &lt;!--[if !IE]&gt;--&gt;                &lt;object type="application/x-shockwave-flash" data="http://datacleaner.eobjects.org/resources/webcasts/etlightweight_controller.swf" width="640" height="498"&gt;                    &lt;param name="quality" value="best" /&gt;                    &lt;param name="bgcolor" value="#1a1a1a" /&gt;                    &lt;param name="allowfullscreen" value="true" /&gt;                    &lt;param name="scale" value="showall" /&gt;                    &lt;param name="allowscriptaccess" value="always" /&gt;                    &lt;param name="flashvars" value="autostart=false&amp;thumb=http://datacleaner.eobjects.org/resources/webcasts/etlightweight_firstframe.png&amp;thumbscale=45&amp;color=0x000000,0x000000" /&gt;                &lt;!--&lt;![endif]--&gt;                    &lt;div id="noUpdate"&gt;                        &lt;p&gt;The Camtasia Studio video content presented here requires JavaScript to be enabled and the latest version of the Adobe Flash Player. If you are using a browser with JavaScript disabled please enable it now. 
Otherwise, please update your version of the free Adobe Flash Player by &lt;a href="http://www.adobe.com/go/getflashplayer"&gt;downloading here&lt;/a&gt;. &lt;/p&gt;                    &lt;/div&gt;                &lt;!--[if !IE]&gt;--&gt;                &lt;/object&gt;                &lt;!--&lt;![endif]--&gt;            &lt;/object&gt;        &lt;/div&gt;&lt;p&gt;Enjoy using &lt;a href="http://datacleaner.eobjects.org"&gt;DataCleaner&lt;/a&gt; :)&lt;/p&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-5482131391570646686?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/5482131391570646686/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=5482131391570646686' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/5482131391570646686'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/5482131391570646686'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2011/11/datacleaner-engine-explained.html' title='The DataCleaner engine explained'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/02068277922010229877</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-1987285229066651165</id><published>2011-10-30T10:33:00.000+01:00</published><updated>2011-10-30T10:33:07.156+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='formatting'/><category 
scheme='http://www.blogger.com/atom/ns#' term='data quality'/><category scheme='http://www.blogger.com/atom/ns#' term='standardizing'/><category scheme='http://www.blogger.com/atom/ns#' term='data format'/><category scheme='http://www.blogger.com/atom/ns#' term='iso date'/><category scheme='http://www.blogger.com/atom/ns#' term='date format'/><category scheme='http://www.blogger.com/atom/ns#' term='date'/><title type='text'>Standardize the date formats in your data</title><content type='html'>One of the things that I see sometimes is that web forms cause unstandardized data in your database. For example, text fields in web forms do not have a native way to specify the type of the data. So what if you have a field that is supposed to be a date? For example the birthdate of your web users? A lot of web applications are not performing real validations of the format and content of the data entered into such fields. I think this typically occurs because it was not thought of as important at the time of designing the initial web page. But maybe it will become important at a point in time if eg. you want to analyze the age groups of your users! The trouble is that later on in the applications lifecycle, a state of unchangeability enters because you're stuck with a bunch of unstandardized data that you cannot conform to a new standardized data format. This is because you will have a lot of different date formats represented. 
For example:&lt;br /&gt;&lt;div&gt;&lt;ul&gt;&lt;li&gt;2011-10-30&lt;/li&gt;&lt;li&gt;20111030&lt;/li&gt;&lt;li&gt;30th of October, 2011&lt;/li&gt;&lt;li&gt;30/10/11&lt;/li&gt;&lt;/ul&gt;&lt;div&gt;&lt;div&gt;And maybe some even more exotic ones...&lt;/div&gt;&lt;div&gt;In this blog entry I will show you how to solve that migration issue with the use of &lt;a href="http://datacleaner.eobjects.org/"&gt;DataCleaner&lt;/a&gt;.&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;/div&gt;&lt;/div&gt;&lt;div&gt;&lt;span class="Apple-style-span" style="font-size: large;"&gt;1. Date mask matching&lt;/span&gt;&lt;/div&gt;&lt;div&gt;The first thing we should do is to analyze which date patterns are present in the data. To do this you need to combine two components: The &lt;b&gt;Date mask matcher &lt;/b&gt;and the &lt;b&gt;Boolean analyzer&lt;/b&gt;. Here are the steps involved.&lt;/div&gt;&lt;div&gt;&lt;ol&gt;&lt;li&gt;First set up your datastore in the welcome screen of DataCleaner.&lt;/li&gt;&lt;li&gt;Click the "Analyze!" button to begin composing your job.&lt;/li&gt;&lt;li&gt;In the tree to the left, select the columns of interest - in our example at least the &lt;i&gt;birthdate&lt;/i&gt; column.&lt;/li&gt;&lt;li&gt;Click "Add transformer -&amp;gt; Matching and standardization -&amp;gt; Date mask matcher".&lt;/li&gt;&lt;/ol&gt;&lt;div&gt;Your screen will now look something like this:&lt;/div&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://2.bp.blogspot.com/-0cL2g6Cm-HA/Tq0Rwxv4etI/AAAAAAAAAB8/6v5Flyer_Sg/s1600/datestd1.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="271" src="http://2.bp.blogspot.com/-0cL2g6Cm-HA/Tq0Rwxv4etI/AAAAAAAAAB8/6v5Flyer_Sg/s320/datestd1.png" width="320" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div&gt;In the middle of the screen you see a list of date masks. Each of these produces a boolean output column (seen below). 
The idea of the Date mask matcher is that it creates the boolean columns so that you can even assert if a particular date is parseable by using several date masks. That's because a single date string like "080910" can be understood in many ways!&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;&lt;span class="Apple-style-span" style="font-size: large;"&gt;2. Analyzing matches&lt;/span&gt;&lt;/div&gt;&lt;div&gt;Moving on, we want to see how well our dates match against the date masks. Since all the matches are now stored in boolean columns, we can apply the Boolean analyzer. Here are the steps involved:&lt;/div&gt;&lt;div&gt;&lt;ol&gt;&lt;li&gt;Click "Add analyzer -&amp;gt; Boolean analyzer".&lt;/li&gt;&lt;li&gt;Make sure all the transformed boolean columns are checked.&lt;/li&gt;&lt;li&gt;Click the "Run analysis" button.&lt;/li&gt;&lt;li&gt;Wait for the analysis to run.&lt;/li&gt;&lt;/ol&gt;&lt;div&gt;Your screen will now contain an analysis result like this:&lt;/div&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://3.bp.blogspot.com/-6Su83jYmPyY/Tq0TTT1EOvI/AAAAAAAAACE/OiWgQRJg7N4/s1600/stddates2.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="150" src="http://3.bp.blogspot.com/-6Su83jYmPyY/Tq0TTT1EOvI/AAAAAAAAACE/OiWgQRJg7N4/s320/stddates2.png" width="320" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div&gt;The result has two parts: The &lt;i&gt;Column statistics&lt;/i&gt; and the&amp;nbsp;&lt;i&gt;Frequency of combinations&lt;/i&gt;.&lt;/div&gt;&lt;div&gt;In the column statistics you can see how much individual date masks have been matched. In our example we can see that 4 of our date masks (no. 2, 3, 5 and 6) are not matched at all, so we may consider removing them from the Date mask matcher.&lt;/div&gt;&lt;div&gt;In the frequency of combinations we get a view of the rows and which match combinations are frequent and less frequent. 
The most frequent combination is that our date mask no. 1 is the only valid mask. The second most frequent combination (&lt;i&gt;Combination 1&lt;/i&gt;) is that none of the date masks apply. If you click the green arrow to the right of the combination you will see which records fall into that category. In our example that looks like this:&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://2.bp.blogspot.com/-xO91UlD13cY/Tq0UYxHusUI/AAAAAAAAACM/hAA61vTCgvk/s1600/stddates3.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="115" src="http://2.bp.blogspot.com/-xO91UlD13cY/Tq0UYxHusUI/AAAAAAAAACM/hAA61vTCgvk/s320/stddates3.png" width="320" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div&gt;This gives us a good hint about which date masks we need to add to our date mask matcher.&lt;/div&gt;&lt;div&gt;The "1982.03.21" date is a simple case - we should simply create a date mask like this: &lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace;"&gt;YYYY.MM.dd&lt;/span&gt;&lt;/div&gt;&lt;div&gt;The "11th of march, 1982" date is a bit more complex. We need to allow the date mask to have a literal string part (the "th of" part) and it needs to recognize the month by name ("march"), not by number. Fortunately this is still possible, the date mask looks like this: &lt;span class="Apple-style-span" style="font-family: 'Courier New', Courier, monospace;"&gt;dd'th of' MMMMM, YYYY&lt;/span&gt;&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;&lt;span class="Apple-style-span" style="font-size: large;"&gt;3. 
Converting to dates&lt;/span&gt;&lt;/div&gt;&lt;div&gt;While we could continue to refine the analysis, this is a blog, not a reference manual and I want to cut to the chase - the actual migration to standardized dates!&lt;/div&gt;&lt;div&gt;So let us look at how you can convert your date strings to actual date fields which you can then choose to format using a standardized format. To do this, click "Add transformer -&amp;gt; Conversion -&amp;gt; Convert to date". You will now see a configuration panel like this:&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://3.bp.blogspot.com/-T-H9VnDx2f0/Tq0Wb5-wJkI/AAAAAAAAACU/-KeYZmSOPJU/s1600/stddates4.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="298" src="http://3.bp.blogspot.com/-T-H9VnDx2f0/Tq0Wb5-wJkI/AAAAAAAAACU/-KeYZmSOPJU/s320/stddates4.png" width="320" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div&gt;In here you also see a list of example date masks. Click the plus-button to add additional date masks to convert by. 
The converter will try from the top to convert if it can, so in case you have cases like "091011" then you have to make your choice here (I would recommend based on your analysis).&lt;/div&gt;&lt;div&gt;We add the few masks that are relevant for our example:&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://2.bp.blogspot.com/-ZMtw85qvbto/Tq0XV8PoEhI/AAAAAAAAACc/y9eC58yTwA0/s1600/stddates5.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="227" src="http://2.bp.blogspot.com/-ZMtw85qvbto/Tq0XV8PoEhI/AAAAAAAAACc/y9eC58yTwA0/s320/stddates5.png" width="320" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div&gt;And we verify that there are no immediate unrecognized dates, by clicking the "Preview data" button:&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://1.bp.blogspot.com/-S1_6MtdCofo/Tq0Xf_VjvsI/AAAAAAAAACk/9txEUJDjp7s/s1600/stddates6.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="320" src="http://1.bp.blogspot.com/-S1_6MtdCofo/Tq0Xf_VjvsI/AAAAAAAAACk/9txEUJDjp7s/s320/stddates6.png" width="299" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div&gt;If a date is not recognized and converted, then the output column will have a null instead of a date. Therefore a good practice would be to look for null values and eg. save them to an error handling file. 
To do this, here's what we do:&lt;/div&gt;&lt;div&gt;&lt;ol&gt;&lt;li&gt;Go to the &lt;i&gt;Filters&lt;/i&gt; tab.&lt;/li&gt;&lt;li&gt;Click "Add filter -&amp;gt; Not null".&lt;/li&gt;&lt;li&gt;Select only your converted column (in our example "birthdate (as date)").&lt;/li&gt;&lt;li&gt;Click "INVALID -&amp;gt; Write to CSV file".&lt;/li&gt;&lt;li&gt;Select the desired columns for the error handling file.&lt;/li&gt;&lt;li&gt;Optionally right click the "Write to CSV file" tab and select "Rename component" to give it a name like "write errors".&lt;/li&gt;&lt;li&gt;Go back to the &lt;i&gt;Filters &lt;/i&gt;tab and click the VALID button to write the valid records to a CSV file or a spreadsheet.&lt;/li&gt;&lt;/ol&gt;&lt;div&gt;After these steps, you should be able to inspect your job flow by clicking the &lt;i&gt;Visualize&lt;/i&gt; button:&lt;/div&gt;&lt;/div&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://3.bp.blogspot.com/-fjwWo0T1cgQ/Tq0Y7cVJkLI/AAAAAAAAACs/wSY0S9PXRpo/s1600/stddates7.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="90" src="http://3.bp.blogspot.com/-fjwWo0T1cgQ/Tq0Y7cVJkLI/AAAAAAAAACs/wSY0S9PXRpo/s320/stddates7.png" width="320" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;div&gt;Now your date standardization job is ready to run again and again to enforce standardized dates!&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-1987285229066651165?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/1987285229066651165/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=1987285229066651165' title='1 Comments'/><link rel='edit' type='application/atom+xml' 
href='http://www.blogger.com/feeds/1301643147176033927/posts/default/1987285229066651165'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/1987285229066651165'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2011/10/standardize-date-formats-in-your-data.html' title='Standardize the date formats in your data'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/02068277922010229877</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://2.bp.blogspot.com/-0cL2g6Cm-HA/Tq0Rwxv4etI/AAAAAAAAAB8/6v5Flyer_Sg/s72-c/datestd1.png' height='72' width='72'/><thr:total>1</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-681696985697644318</id><published>2011-09-26T20:37:00.000+02:00</published><updated>2011-09-28T13:21:01.048+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='sql'/><category scheme='http://www.blogger.com/atom/ns#' term='data'/><category scheme='http://www.blogger.com/atom/ns#' term='data quality'/><category scheme='http://www.blogger.com/atom/ns#' term='analysis'/><category scheme='http://www.blogger.com/atom/ns#' term='dqa'/><category scheme='http://www.blogger.com/atom/ns#' term='pattern'/><category scheme='http://www.blogger.com/atom/ns#' term='profiling'/><title type='text'>Data Profiling SQLized. Uh oh...</title><content type='html'>&lt;div style="clear: both; float: right; width: 260px; background-color: #f0f0f0; border: 1px solid gray; padding: 5px; margin-left: 5px;"&gt;&lt;img height="320" src="http://3.bp.blogspot.com/-KBvH5qyXKbc/ToDELopw8tI/AAAAAAAAAB4/hGUEv4R8Vck/s320/scrooge" width="249" /&gt;&lt;p&gt;What does 'Scrooge' and 'Kasper' have in common? 
Not much according to my SQL data profiler.&lt;/p&gt;&lt;/div&gt;Some months back (admittedly, more than a couple) I was &lt;a href="http://kasper.eobjects.org/2011/01/its-very-easy-to-make-your-own-data.html"&gt;explaining&lt;/a&gt; how I think people tend to do "home made data profiling" too often because it apparently seems easy to do in SQL. I went on to promise that I would also try to play the devil's advocate and show a few examples of "copy paste queries" that you can use for such a tool. In this blog post I will try to do so. But let me first say:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;i&gt;Don't do this at home, kids!&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;&lt;p&gt;But let's start with the first query that I would add to my home baked profiling SQL script. We'll do what anyone who hasn't really understood what profiling is all about will tell you to do: Do a column analysis based on the metrics that are easily available through all SQL implementations:&lt;/p&gt;&lt;blockquote&gt;SELECT MAX(column), MIN(column), COUNT(column), COUNT(*)&lt;br/&gt;FROM table;&lt;/blockquote&gt;&lt;p&gt;This is a good query, especially for number columns. Here I would typically look if the MIN value is below zero or not. Is the COUNT(column) equal to the COUNT(*)? If not, it means that there are nulls in the column. Why not just do two separate queries, that would be more readable? Yes, but it also makes my script larger and I will have more stuff to maintain. 
But let's try it, we can actually improve it also by adding a few metrics:&lt;/p&gt;&lt;blockquote style="clear: both;"&gt;SELECT MAX(column) AS &lt;b&gt;highest_value&lt;/b&gt; FROM table;&lt;br /&gt;SELECT MIN(column) AS &lt;b&gt;lowest_positive_value&lt;/b&gt; FROM table WHERE column &amp;gt; 0;&lt;br /&gt;SELECT MIN(column) AS&amp;nbsp;&lt;b&gt;lowest_negative_value&lt;/b&gt;&amp;nbsp;FROM table WHERE column &amp;lt; 0;&lt;br /&gt;SELECT COUNT(*) AS &lt;b&gt;num_values&lt;/b&gt; FROM table WHERE column IS NOT NULL;&lt;br /&gt;SELECT COUNT(*) AS &lt;b&gt;num_nulls&lt;/b&gt; FROM table WHERE column IS NULL;&lt;/blockquote&gt;&lt;p&gt;Now let's continue with some string columns, because I think more often than not, this is where data profiling turns out to be really valuable. Something that I often see as an inconsistency in structured string data is case differences. Such inconsistencies make reporting and analysis of the data cumbersome and error prone because grouping and filtering will ultimately be imprecise. So let's do a case analysis:&lt;/p&gt;&lt;blockquote&gt;SELECT COUNT(*) AS &lt;b&gt;num_lowercase&lt;/b&gt; FROM table WHERE LCASE(column) = column;&lt;br /&gt;SELECT COUNT(*) AS &lt;b&gt;num_uppercase&lt;/b&gt; FROM table WHERE UCASE(column) = column;&lt;br /&gt;SELECT COUNT(*) AS &lt;b&gt;num_mixed_case&lt;/b&gt; FROM table WHERE LCASE(column) &amp;lt;&amp;gt; column AND UCASE(column) &amp;lt;&amp;gt; column;&lt;/blockquote&gt;&lt;p&gt;And then on to query the always popular "first letter is capitalized" type of strings. This one really depends on the database, because substring functions have not been standardized across major SQL implementations. I'll show a few:&lt;/p&gt;&lt;p&gt;INITCAP-based approach (e.g. PostgreSQL and Oracle):&lt;/p&gt;&lt;blockquote&gt;SELECT COUNT(*) AS &lt;b&gt;num_first_letter_capitalized&lt;/b&gt; FROM table WHERE INITCAP(column) = column;&lt;/blockquote&gt;&lt;p&gt;SUBSTRING-based approach (e.g. 
Microsoft SQL Server):&lt;/p&gt;&lt;blockquote&gt;SELECT COUNT(*) AS &lt;b&gt;num_first_letter_capitalized&lt;/b&gt; FROM table&lt;br /&gt;WHERE UCASE(SUBSTR(column FROM 0 FOR 1)) =&amp;nbsp;SUBSTR(column FROM 0 FOR 1)&lt;br /&gt;AND LCASE(SUBSTR(column FROM 1)) = SUBSTR(column FROM 1)&lt;/blockquote&gt;A bit cumbersome, but gets the job done. Being the devil's advocate, I'm still not convinced that I should throw out my home baked SQL just yet. So I'm ready for another challenge!&lt;br /&gt;&lt;br /&gt;Let's have a look at pattern finding through SQL. Again this is perfectly possible. I've even heard many people telling me that we should rewrite &lt;a href="http://datacleaner.eobjects.org" target="_blank"&gt;DataCleaner&lt;/a&gt;'s Pattern Finder to make it SQL optimized. Read on and judge for yourself :-)&lt;br /&gt;&lt;br /&gt;To match tokens by pattern we apply the simplest possible configuration in DataCleaner's pattern finder: All letters are replaced by 'A' or 'a' and all numbers are replaced by '9'. 
This makes for a nice pattern based matcher, like this:&lt;br /&gt;&lt;blockquote&gt;Mickey Mouse -&amp;gt; 'Aaaaaa Aaaaa'&lt;br /&gt;Minnie Mouse -&amp;gt; 'Aaaaaa Aaaaa'&lt;br /&gt;Joachim von And -&amp;gt; 'Aaaaaaa aaa Aaa'&lt;br /&gt;kasper@eobjects.dk -&amp;gt; 'aaaaaa@aaaaaaaa.aa'&lt;/blockquote&gt;&lt;i&gt;(Random fact: 'Joachim von And' is the Danish name for Scrooge McDuck)&lt;/i&gt;&lt;br /&gt;As you can see from the patterns, this is a good preliminary way to determine if string values have the same form and syntax - we immediately see that the email address is odd and that although all other values look like valid names, some have lowercase tokens (prefixes) in between.&lt;br /&gt;&lt;br /&gt;In PostgreSQL for example, this would look like:&lt;br /&gt;&lt;br /&gt;&lt;blockquote&gt;SELECT regexp_replace(regexp_replace(regexp_replace(column, '[a-z]','a','g'), '[A-Z]','A','g'), '[0-9]','9','g') as &lt;b&gt;pattern&lt;/b&gt;, COUNT(*) as &lt;b&gt;pattern_count&lt;/b&gt;&amp;nbsp;from table GROUP BY pattern;&lt;/blockquote&gt;&lt;br /&gt;This actually works like a charm and returns:&lt;br /&gt;&lt;br /&gt;&lt;table border="0" cellpadding="0" cellspacing="0" class="pretty-table"&gt;&lt;tbody&gt;&lt;tr&gt;&lt;th&gt;pattern&lt;/th&gt;&lt;th&gt;pattern_count&lt;/th&gt;&lt;/tr&gt;&lt;tr&gt;&lt;td&gt;Aaaaaa Aaaaa&lt;/td&gt;&lt;td&gt;2&lt;/td&gt;&lt;/tr&gt;&lt;tr&gt;&lt;td&gt;Aaaaaaa aaa Aaa&lt;/td&gt;&lt;td&gt;1&lt;/td&gt;&lt;/tr&gt;&lt;tr&gt;&lt;td&gt;aaaaaa@aaaaaaaa.aa&lt;/td&gt;&lt;td&gt;1&lt;/td&gt;&lt;/tr&gt;&lt;/tbody&gt;&lt;/table&gt;So why even use a profiling tool for finding patterns? All this seems to be possible through raw SQL?&lt;br /&gt;&lt;br /&gt;I will now stop playing the devil's advocate... Cuz' seriously... This is nonsense! Having worked for some years on a pretty good &lt;a href="http://datacleaner.eobjects.org" target="_blank"&gt;data quality analysis tool&lt;/a&gt;, this approach absolutely disgusts me. 
Here's just a few random reasons why, off the top of my head:&lt;br /&gt;&lt;br /&gt;&lt;ul&gt;&lt;li&gt;We still haven't scratched the surface when it comes to supporting eg. non-ASCII characters in patterns.&lt;/li&gt;&lt;li&gt;Some tokens in patterns should be matched regardless of string length, some shouldn't. In our case we never matched strings with unequal lengths (eg. Scrooge and Mickey). This is a setting that you will want to play around with! (For examples, check out our &lt;a href="http://datacleaner.eobjects.org/resources/docs/2.3/html/ch04s07.html" target="_blank"&gt;Pattern Finder documentation&lt;/a&gt;)&lt;/li&gt;&lt;li&gt;Each metric in the previous analyses required their own query. This means that if you want to analyze a hundred metrics, you would need to query (at least) a hundred times.&lt;/li&gt;&lt;li&gt;A lot of metrics are simply not possible to express in SQL. Some examples: Diacritic character count, max/min amount of words, matches against reference data and more.&lt;/li&gt;&lt;li&gt;Often you will want to preprocess data before (or actually I would argue, as a part of) your profiling. This can be for example to extract information from composite values or to replace known inconsistencies with standardized values.&lt;/li&gt;&lt;li&gt;All the examples offer no drill-to-detail behaviour, so further analysis is more or less impossible. And drill-to-detail is not offered through SQL, so there is for example no way to express in our pattern finder SQL that we want to keep some samples of various pattern matches for later inspection.&lt;/li&gt;&lt;li&gt;All in all, using SQL for data profiling makes for a terribly unexplorative approach. It's a pain having to write and modify such an amount of SQL to get simple things done, so don't rely on it, because it will make you lazy and then you'll not investigate properly!&lt;/li&gt;&lt;li&gt;And of course, SQL only applies to databases that support SQL! 
If you're looking to profile data in other formats, then you're out of luck with this approach.&lt;/li&gt;&lt;/ul&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-681696985697644318?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/681696985697644318/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=681696985697644318' title='7 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/681696985697644318'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/681696985697644318'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2011/09/data-profiling-sqlized-uh-oh.html' title='Data Profiling SQLized. 
Uh oh...'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/02068277922010229877</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://3.bp.blogspot.com/-KBvH5qyXKbc/ToDELopw8tI/AAAAAAAAAB4/hGUEv4R8Vck/s72-c/scrooge' height='72' width='72'/><thr:total>7</thr:total><georss:featurename>Copenhagen, Denmark</georss:featurename><georss:point>55.6760968 12.5683371</georss:point><georss:box>55.604469300000005 12.4104086 55.7477243 12.726265600000001</georss:box></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-1248894766615174243</id><published>2011-08-15T15:19:00.000+02:00</published><updated>2011-08-15T15:19:47.229+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='ftr'/><category scheme='http://www.blogger.com/atom/ns#' term='data quality'/><category scheme='http://www.blogger.com/atom/ns#' term='data entry'/><category scheme='http://www.blogger.com/atom/ns#' term='contact data'/><category scheme='http://www.blogger.com/atom/ns#' term='first time right'/><category scheme='http://www.blogger.com/atom/ns#' term='customer data'/><title type='text'>Get your data right... First Time Right!</title><content type='html'>In my blog I mostly talk about data quality tools like &lt;a href="http://datacleaner.eobjects.org"&gt;DataCleaner&lt;/a&gt; that are &lt;i&gt;diagnostic&lt;/i&gt; and &lt;i&gt;treating&lt;/i&gt;, rather than &lt;i&gt;preventive&lt;/i&gt;. Such tools have a lot of merit and strengths, but for a total view on data quality it is crucial that you also include tools that are preventive of poor data ever entering your system. 
In this blog post I want to talk a bit about a project that I have been involved with at &lt;a href="http://www.humaninference.com"&gt;Human Inference&lt;/a&gt; which is just that - our &lt;a href="http://www.humaninference.com/solutions/first-time-right"&gt;First Time Right&lt;/a&gt; JavaScript solution.&lt;br /&gt;&lt;br /&gt;The idea is that we provide a subscription-based JavaScript API where you can easily decorate any HTML contact form with a lot of rich features for on-the-fly verification, validation, auto correction and helpful features for automatic filling of derived fields.&lt;br /&gt;&lt;br /&gt;For example, the API allows you to enter (or copy/paste) a full name, including titulation, salutation, initials and more - and get these items parsed and placed into corresponding fields on a detailed contact form. It will even automatically detect what the gender of the contact is, and apply this in gender fields. We have similar data entry aids for address input, email input, phone numbers and contact duplicate checking.&lt;br /&gt;&lt;br /&gt;Take a look at the video below, which demonstrate most of the features:&lt;br /&gt;&lt;br /&gt;&lt;iframe width="640" height="390" src="http://www.youtube.com/embed/BN80Ezyo2WY?rel=0" frameborder="0" allowfullscreen&gt;&lt;/iframe&gt;&lt;br /&gt;&lt;br /&gt;Now this is quite exciting functionality, but this is also a technical blog, so I'll talk a bit about the technology involved.&lt;br /&gt;&lt;br /&gt;We built the project based on &lt;a href="http://code.google.com/webtoolkit/"&gt;Google Web Toolkit&lt;/a&gt; (GWT). GWT enables us to build a very rich application, entirely in JavaScript, so that it can be embedded on any website - no matter if it's PHP based, ASP.NET based, Java based or whatever. Of course we do have a server-side piece that the JavaScript communicates with, but that is all hosted at Human Inferences cloud platform. 
So in other words: The deployment of our First Time Right principle is a breeze!&lt;br /&gt;&lt;br /&gt;Since AJAX applications require locality of the server that it is communicating with, we've had to overcome quite some issues to allow the JavaScript to be external from the deployment sites. This is crucial as we want upgrades and improvements to be performed on our premises, not at individual customer sites. This way we can really leverage the cloud- and subscription-based approach to data quality. Our solution to the locality problem has been the &lt;a href="http://en.wikipedia.org/wiki/JSONP"&gt;JSONP&lt;/a&gt; approach, which is an alternative protocol for implementing AJAX behaviour. JSONP is a rather clever construct where instead of issuing actual HTTP requests, you insert new &amp;lt;script&amp;gt; elements into the HTML DOM at runtime! This means that the browser will perform a new request simply because the &amp;lt;script&amp;gt; element refers a new JavaScript source. It's not "pretty" to tackle errorhandling and the asynchronicity that this approach brings on, but we've done a lot of work to get it right, and it works like a charm! I hope to share some of our design patterns later, to demonstrate how it works.&lt;br /&gt;&lt;br /&gt;Another challenge was of security. Obviously you will want to make sure that the JavaScript is only available for subscribers. And only for the websites that they've subscribed to (because otherwise the JavaScript can simply be copied to another website). Our way around this resembles how for example Google manages their subscriptions to Google Maps and other subscription services, where you need a site-specific API key. Very clever.&lt;br /&gt;&lt;br /&gt;A few optional features may require some local add-on deployment. In particular, deduplication requires us to know the contact data to use as the source for detecting if a new contact is a duplicate. 
Here we have two options: On-premise installation of the deduplication engine or hooking up with our cloud-based deduplication engine, which can be configured to sync with your datastores.&lt;br /&gt;&lt;br /&gt;All in all I am quite enthusiastic about the FTR solution and the technology behind the solution. I also think that our FTR API is an example of a lightweight approach to implementing Data Quality, which complements DataCleaner very well. Both tools are extremely useful for ensuring a high level of data quality, and both tools are very intuitive and flexible in the way you can deploy them.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-1248894766615174243?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/1248894766615174243/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=1248894766615174243' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/1248894766615174243'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/1248894766615174243'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2011/08/get-your-data-right-first-time-right.html' title='Get your data right... 
First Time Right!'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/02068277922010229877</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://img.youtube.com/vi/BN80Ezyo2WY/default.jpg' height='72' width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-5929710753353792861</id><published>2011-08-04T13:46:00.008+02:00</published><updated>2011-08-05T16:44:40.426+02:00</updated><title type='text'>Eye candy in Java 7: New javadoc style!</title><content type='html'>By now most of you've probably heard that Java 7 is out and there's a lot of discussion about new features, the loop optimization bug and general adoption.&lt;br /&gt;&lt;br /&gt;But one of the things in Java 7 which has escaped most people's attention (I think) is the new javadoc style.&lt;br /&gt;&lt;br /&gt;Check it out:&lt;br /&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;width: 400px; height: 273px;" src="http://3.bp.blogspot.com/-1Ha-DY-m8H4/TjqHNfBFdvI/AAAAAAAAAB0/-xSPHAjg4r0/s400/mm-apidocs.jpg" border="0" alt="" id="BLOGGER_PHOTO_ID_5636966549341697778" /&gt;&lt;br /&gt;&lt;br /&gt;And see it live - we've just published an updated &lt;a href="http://metamodel.eobjects.org/apidocs/"&gt;API documentation for MetaModel&lt;/a&gt;.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-5929710753353792861?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/5929710753353792861/comments/default' title='Post Comments'/><link rel='replies' type='text/html' 
href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=5929710753353792861' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/5929710753353792861'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/5929710753353792861'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2011/08/eye-candy-in-java-7-new-javadoc-style.html' title='Eye candy in Java 7: New javadoc style!'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/02068277922010229877</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://3.bp.blogspot.com/-1Ha-DY-m8H4/TjqHNfBFdvI/AAAAAAAAAB0/-xSPHAjg4r0/s72-c/mm-apidocs.jpg' height='72' width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-9142009061836454629</id><published>2011-08-01T21:59:00.013+02:00</published><updated>2011-08-01T23:02:51.673+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='data'/><category scheme='http://www.blogger.com/atom/ns#' term='csv'/><category scheme='http://www.blogger.com/atom/ns#' term='data quality'/><category scheme='http://www.blogger.com/atom/ns#' term='unittest'/><category scheme='http://www.blogger.com/atom/ns#' term='excel'/><category scheme='http://www.blogger.com/atom/ns#' term='column'/><category scheme='http://www.blogger.com/atom/ns#' term='verify'/><category scheme='http://www.blogger.com/atom/ns#' term='test'/><category scheme='http://www.blogger.com/atom/ns#' term='unit'/><category scheme='http://www.blogger.com/atom/ns#' term='table'/><category scheme='http://www.blogger.com/atom/ns#' term='database'/><title type='text'>Unit test 
your data</title><content type='html'>&lt;div&gt;&lt;img style="float:right; margin: 10px; width: 200px;" src="http://1.bp.blogspot.com/-K_DhJcZF5Qs/TjcP41U4fpI/AAAAAAAAABs/s7OHbwhJfG4/s400/testing.jpg" border="0" alt="" id="BLOGGER_PHOTO_ID_5635990927739223698" /&gt;In modern software development &lt;a href="http://en.wikipedia.org/wiki/Unit_testing"&gt;unit testing&lt;/a&gt; is widely used as a way to check the quality of your code. For those of you who are not software developers, the idea in unit testing is that you define rules for your code that you check again and again, to verify that your code works, and keeps on working.&lt;/div&gt;&lt;div&gt;Unit testing and data quality have quite a lot in common in my opinion. Both code and data change over time, so there is a constant need to keep checking that your code/data has the desired characteristics. This was something that I was &lt;a href="http://datacleaner.eobjects.org/topic/206/2-1-1-Pattern-Finder-predefined-token"&gt;recently&lt;/a&gt; reminded of by a &lt;a href="http://datacleaner.eobjects.org/"&gt;DataCleaner&lt;/a&gt; user on our forums.&lt;/div&gt;&lt;div&gt;I am happy to see that data stewards and the like are picking up this idea, as it has been maturing for quite some time in the software development industry. It also got me thinking: In software development we have a lot of related methods and practices around unit testing. 
Let me try to list a few, which are very important, and which we can perhaps also apply to data?&lt;/div&gt;&lt;table class="pretty-table" border="0" cellspacing="0" cellpadding="0"&gt;&lt;tbody&gt;&lt;tr&gt;&lt;th&gt;Code&lt;/th&gt;&lt;th&gt;Data&lt;/th&gt;&lt;/tr&gt;&lt;tr&gt;&lt;td&gt;Compile-time checking&lt;br /&gt;(Ensuring correct syntax)&lt;/td&gt;&lt;td&gt;Database constraints&lt;/td&gt;&lt;/tr&gt;&lt;tr&gt;&lt;td&gt;Unit testing&lt;br /&gt;(Checking a single unit of code)&lt;/td&gt;&lt;td&gt;Validating data profiling?&lt;/td&gt;&lt;/tr&gt;&lt;tr&gt;&lt;td&gt;Continuous integration&lt;br /&gt;(Running all tests periodically)&lt;/td&gt;&lt;td&gt;Data Quality monitoring?&lt;/td&gt;&lt;/tr&gt;&lt;tr&gt;&lt;td&gt;Bug tracking&lt;br /&gt;(Maintaining records of all code issues)&lt;/td&gt;&lt;td&gt;?&lt;/td&gt;&lt;/tr&gt;&lt;tr&gt;&lt;td&gt;Static code analysis&lt;br /&gt;(a la &lt;a href="http://findbugs.sourceforge.net/" target="_blank"&gt;FindBugs&lt;/a&gt;)&lt;/td&gt;&lt;td&gt;Explorative data profiling?&lt;/td&gt;&lt;/tr&gt;&lt;tr&gt;&lt;td&gt;Refactoring&lt;br /&gt;(Changing code without breaking functionality)&lt;/td&gt;&lt;td&gt;ETL with applied DQ rules?&lt;/td&gt;&lt;/tr&gt;&lt;/tbody&gt;&lt;/table&gt;&lt;br /&gt;&lt;div&gt;&lt;small&gt;For explanation of the various data profiling and monitoring types, please refer to my previous post, &lt;a href="http://kasper.eobjects.org/2011/04/two-types-of-data-profiling.html"&gt;Two types of data profiling&lt;/a&gt;.&lt;/small&gt;&lt;/div&gt;&lt;div&gt;Of course not all metaphors here map one-to-one, but in my oppinion it is a pretty good metaphor. For me, as a software product developer, I think it also points out some of the weak and strong points of current Data Quality tools. In software development the tool support for unit testing, continuous integration, bug tracking and more is incredible. In the data world I feel that many tools focus only on one or two of the above areas of quality control. 
Of course you can combine tools, but as I've argued before, &lt;a href="http://www.datavaluetalk.com/2011/02/17/data-quality-analysis-%E2%80%93-it-requires-a-bit-of-all-worlds/" target="_blank"&gt;switching tools also comes at a large price&lt;/a&gt;.&lt;/div&gt;&lt;div&gt;So what do I suggest? Well, fellow product developers, let's make better tools that integrate more disciplines of data quality! I know that this has been and still will be my aim for &lt;a href="http://datacleaner.eobjects.org"&gt;DataCleaner&lt;/a&gt;.&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-9142009061836454629?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/9142009061836454629/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=9142009061836454629' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/9142009061836454629'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/9142009061836454629'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2011/08/unit-test-your-data.html' title='Unit test your data'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/02068277922010229877</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://1.bp.blogspot.com/-K_DhJcZF5Qs/TjcP41U4fpI/AAAAAAAAABs/s7OHbwhJfG4/s72-c/testing.jpg' height='72' 
width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-8656861766343453178</id><published>2011-07-14T21:59:00.003+02:00</published><updated>2011-07-14T22:03:06.103+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='distinct'/><category scheme='http://www.blogger.com/atom/ns#' term='frequency'/><category scheme='http://www.blogger.com/atom/ns#' term='value distribution'/><category scheme='http://www.blogger.com/atom/ns#' term='chart'/><category scheme='http://www.blogger.com/atom/ns#' term='datacleaner'/><category scheme='http://www.blogger.com/atom/ns#' term='data profiling'/><title type='text'>A colorful value distribution</title><content type='html'>A few weeks ago I was dealing a bit of attention to the charts in &lt;a href="http://datacleaner.eobjects.org"&gt;DataCleaner&lt;/a&gt;. Of special interest are the &lt;a href="http://datacleaner.eobjects.org/topic/200/DataCleaner-2-1-3D-charts-" target="_blank"&gt;value distribution charts&lt;/a&gt; which has caused some discussions...&lt;br /&gt;&lt;br /&gt;Anyways, here's a proposal which includes nicer (IMO) coloring, a "distinct count measure", a dedicated "&amp;lt;blank&amp;gt;" keyword and a few other niceties.&lt;br /&gt;&lt;br /&gt;&lt;table style="width:194px;"&gt;&lt;tr&gt;&lt;td align="center" style="height:194px;background:url(https://picasaweb.google.com/s/c/transparent_album_background.gif) no-repeat left"&gt;&lt;a href="https://picasaweb.google.com/115504339658300272373/ValueDistributionChartProposals?authuser=0&amp;feat=embedwebsite"&gt;&lt;img src="https://lh6.googleusercontent.com/-JsEpTTpZINo/ThQ9q01vOWE/AAAAAAAAABA/apkqt2qn45k/s160-c/ValueDistributionChartProposals.jpg" width="160" height="160" style="margin:1px 0 0 4px;"&gt;&lt;/a&gt;&lt;/td&gt;&lt;/tr&gt;&lt;tr&gt;&lt;td style="text-align:center;font-family:arial,sans-serif;font-size:11px"&gt;&lt;a 
href="https://picasaweb.google.com/115504339658300272373/ValueDistributionChartProposals?authuser=0&amp;feat=embedwebsite" style="color:#4D4D4D;font-weight:bold;text-decoration:none;"&gt;Value distribution chart proposals&lt;/a&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;&lt;br /&gt;&lt;br /&gt;You can expect to see this live in DataCleaner 2.3 which is expected in august.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-8656861766343453178?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/8656861766343453178/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=8656861766343453178' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/8656861766343453178'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/8656861766343453178'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2011/07/colorful-value-distribution.html' title='A colorful value distribution'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/02068277922010229877</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='https://lh6.googleusercontent.com/-JsEpTTpZINo/ThQ9q01vOWE/AAAAAAAAABA/apkqt2qn45k/s72-c/ValueDistributionChartProposals.jpg' height='72' 
width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-8377472360516677398</id><published>2011-07-08T17:56:00.017+02:00</published><updated>2011-07-11T15:39:48.504+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='csv'/><category scheme='http://www.blogger.com/atom/ns#' term='java'/><category scheme='http://www.blogger.com/atom/ns#' term='write'/><category scheme='http://www.blogger.com/atom/ns#' term='api'/><category scheme='http://www.blogger.com/atom/ns#' term='create table'/><category scheme='http://www.blogger.com/atom/ns#' term='metamodel'/><category scheme='http://www.blogger.com/atom/ns#' term='excel'/><category scheme='http://www.blogger.com/atom/ns#' term='database'/><category scheme='http://www.blogger.com/atom/ns#' term='proposal'/><category scheme='http://www.blogger.com/atom/ns#' term='jdbc'/><category scheme='http://www.blogger.com/atom/ns#' term='insert'/><title type='text'>Proposal for writing data in MetaModel 2.0</title><content type='html'>Hi everyone,&lt;br /&gt;&lt;br /&gt;For a long time we've had a lot of people asking "can I use MetaModel to not only read varying data formats, but also to write data?". So far the answer has been "no, &lt;a href="http://metamodel.eobjects.org"&gt;MetaModel&lt;/a&gt; is a read-only API". But lately we've been working at &lt;a href="http://www.humaninference.com" target="_blank"&gt;Human Inference&lt;/a&gt; on a proposal for an API to write to the same &lt;a href="http://metamodel.eobjects.org/apidocs/org/eobjects/metamodel/DataContext.html" target="_blank"&gt;DataContexts&lt;/a&gt; as you read from, in MetaModel.&lt;br /&gt;&lt;br /&gt;Here's a glimpse of the API, by example. 
Currently we have fluent API's for creating tables and inserting rows:&lt;br /&gt;&lt;br /&gt;&lt;pre class="prettyprint lang-java"&gt;UpdateableDataContext dc = ...&lt;br /&gt;Schema schema = dc.getDefaultSchema();&lt;br /&gt;Table table = dc.createTable(schema, "my_table") &lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;.withColumn("id").ofType(INTEGER)&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;.withColumn("name").ofType("VARCHAR").ofSize(255)&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;.execute();&lt;br /&gt;dc.insertInto(table).value("id",1).value("name","john doe").execute();&lt;br /&gt;dc.insertInto(table).value("id",2).value("name","jane doe").execute();&lt;/pre&gt;&lt;br /&gt;This API has so far been implemented succesfully for Excel spreadsheets, CSV files and JDBC databases - our 3 most used datastore types.&lt;br /&gt;&lt;br /&gt;You can find the work-in-progress of the proposal in SVN at:&lt;br /&gt;&lt;a href="http://eobjects.org/svn/MetaModel/branches/2.0-writable-datacontext/" target="_blank"&gt;http://eobjects.org/svn/MetaModel/branches/2.0-writable-datacontext/&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;We would like to get your reactions on the API proposal. Does it suit your needs and do you like the approach? Will it be acceptible to launch 2.0 with just these "CREATE TABLE" and "INSERT" operations, or will other operations (such as DELETE, UPDATE, DROP, ALTER) be needed before it makes up a valid solution for you guys?&lt;br /&gt;&lt;br /&gt;Best regards,&lt;br /&gt;Kasper&lt;br /&gt;&lt;br /&gt;&lt;span style="font-weight:bold;"&gt;Update 2011-07-11&lt;/span&gt;&lt;br /&gt;A few people have provided feedback (thank you for that) and also some performance tests on our side revealed that we need to apply a more batch-friendly approach, which also has better encapsulation and isolation properties for multiple and large updates. So, we've instead applied a pattern similar to Spring's template or Akka's atomic STM pattern. 
The idea is that the user supplies an UpdateScript which will be executed in isolation, like this:&lt;br /&gt;&lt;pre class="prettyprint lang-java"&gt;UpdateableDataContext dc = ...&lt;br /&gt;final Schema schema = dc.getDefaultSchema();&lt;br /&gt;dc.executeUpdate(new UpdateScript() {&lt;br /&gt;&amp;nbsp;&amp;nbsp;public void run(UpdateCallback &lt;b&gt;callback&lt;/b&gt;) {&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;Table table = &lt;b&gt;callback&lt;/b&gt;.createTable(schema, "my_table") &lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;.withColumn("id").ofType(INTEGER)&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;.withColumn("name").ofType("VARCHAR").ofSize(255)&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;.execute();&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&lt;b&gt;callback&lt;/b&gt;.insertInto(table).value("id",1).value("name","john doe").execute();&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&lt;b&gt;callback&lt;/b&gt;.insertInto(table).value("id",2).value("name","jane doe").execute();&lt;br /&gt;&amp;nbsp;&amp;nbsp;}&lt;br /&gt;});&lt;/pre&gt;&lt;br /&gt;On first sight it might not look quite as elegant, but I think that in the big picture this pattern is actually a lot nicer. First of all because it gives you a very clear understanding of exactly where in your code you modify your data. It also makes it a lot easier to write eg. fallback-scripts in case something goes wrong with your update. For datastore types that support transactions (eg. JDBC databases) it also makes it possible for us to easily demarcate the transactional boundaries.&lt;br /&gt;&lt;br /&gt;... 
Please keep posting feedback!&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-8377472360516677398?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/8377472360516677398/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=8377472360516677398' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/8377472360516677398'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/8377472360516677398'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2011/07/proposal-for-writing-data-in-metamodel.html' title='Proposal for writing data in MetaModel 2.0'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/02068277922010229877</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-7738364652789785533</id><published>2011-06-27T22:05:00.011+02:00</published><updated>2011-06-27T22:28:51.296+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='data'/><category scheme='http://www.blogger.com/atom/ns#' term='java'/><category scheme='http://www.blogger.com/atom/ns#' term='extension'/><category scheme='http://www.blogger.com/atom/ns#' term='add-on'/><category scheme='http://www.blogger.com/atom/ns#' term='plugin'/><category scheme='http://www.blogger.com/atom/ns#' term='programming'/><category scheme='http://www.blogger.com/atom/ns#' term='data quality analysis'/><category 
scheme='http://www.blogger.com/atom/ns#' term='datacleaner'/><category scheme='http://www.blogger.com/atom/ns#' term='data profiling'/><title type='text'>Developing DataCleaner extensions</title><content type='html'>&lt;p&gt;Today has been all about the &lt;a href="http://datacleaner.eobjects.org"&gt;release of DataCleaner 2.2&lt;/a&gt;. This is a significant release of our &lt;a href="http://www.datavaluetalk.com/2011/02/17/data-quality-analysis-%E2%80%93-it-requires-a-bit-of-all-worlds/" target="_blank"&gt;Data Quality Analysis product&lt;/a&gt; which I think is becoming more and more mature and capable.&lt;/p&gt;&lt;p&gt;One of the really neat things in DataCleaner 2.2 is it's extensibility. Lots of applications are extensible, but few are in my oppinion as easy to approach as DataCleaner. We expose a limited API which is extremely flexible though. This makes it easy for developers to explore the opportunities and the architecture.&lt;/p&gt;&lt;p&gt;Another great strengths of DataCleaner's extension architecture is the &lt;a href="http://datacleaner.eobjects.org/extensions" target="_blank"&gt;ExtensionSwap&lt;/a&gt;. With a click in the browser you can install an extension onto a running application. 
Personally I think it's a quite jaw-dropping effect when you see the seamlessness of the integration here.
You can now grab DataCleaner in the central Maven repos and you can bootstrap the application in a really easy fashion:
src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-9082335629674289771</id><published>2011-06-20T20:27:00.001+02:00</published><updated>2011-06-20T22:02:18.593+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='open source'/><category scheme='http://www.blogger.com/atom/ns#' term='java'/><category scheme='http://www.blogger.com/atom/ns#' term='sassyreader'/><category scheme='http://www.blogger.com/atom/ns#' term='sas7bdat'/><category scheme='http://www.blogger.com/atom/ns#' term='free'/><category scheme='http://www.blogger.com/atom/ns#' term='read'/><category scheme='http://www.blogger.com/atom/ns#' term='file'/><category scheme='http://www.blogger.com/atom/ns#' term='sas'/><category scheme='http://www.blogger.com/atom/ns#' term='database'/><title type='text'>SassyReader - Open Source reader of SAS data sets for Java</title><content type='html'>I'm quite excited to announce the first release of a brand new eobjects.org project: &lt;a href="http://sassyreader.eobjects.org"&gt;SassyReader&lt;/a&gt;. SassyReader is in my oppinion in deed something sassy as it fills a gap that has long existed in open source applications that deals with data management (ETL tools, tools like &lt;a href="http://datacleaner.eobjects.org"&gt;DataCleaner&lt;/a&gt; and the like). SassyReader is a library for reading data in the sas7bdat format, aka. the format that the &lt;a href="http://www.sas.com" target="_blank"&gt;SAS&lt;/a&gt; statistical software use! It is written entirely in Java and reads the files from their binary format (eg. 
it's not a connector to the SAS system, but a reader of the raw data).&lt;br /&gt;&lt;br /&gt;&lt;a href="http://sassyreader.eobjects.org" imageanchor="1" style="clear:right; float:right; margin-left:1em; margin-bottom:1em"&gt;&lt;img border="0" height="249" width="320" src="http://1.bp.blogspot.com/-ZIcMJjT2WdE/Tf-LtfnKUAI/AAAAAAAAAIU/w1-5zgddTlI/s320/sassyreader.jpg" alt="Visit the SassyReader website" /&gt;&lt;/a&gt;So why is this important? Well first of all because it is very difficult to create systems that interoperate with SAS. SAS does ship a &lt;a href="http://support.sas.com/documentation/onlinedoc/jdbc/index.html" target="_blank"&gt;JDBC driver&lt;/a&gt; but it's compliancy with JDBC is actually very limited. Even creating a connection will typically require use of SAS's proprietary classes, so you cannot go the standards JDBC way. There is also no JDBC metadata support and you need to set up a server-side SAS/SHARE option to even expose the connection. Furthermore this is an add-on product from SAS which costs additional money if you're just a base SAS user. So doing trivial things like connecting and querying a data set requires a lot of work and money. In my oppinion this is poor practice - a legacy way of trying to lock people in to using only a particular brand of software, simply because interoperability is a big pain.&lt;br /&gt;&lt;br /&gt;All in all I see a great benefit in a project like SassyReader for those who simply want a way of reading the data that is stored in SAS files.&lt;br /&gt;&lt;br /&gt;I cannot take a whole lot of credit for this project though. Most of the really challenging stuff was created by Matt Shotwell, aka. BioStatMatt, who founded the &lt;a href="https://github.com/biostatmatt/sas7bdat"&gt;sas7bdat&lt;/a&gt; project which is written in R. My contribution was to port it to Java and fix a few issues on the way. Matt put together a lot of fractioned works that describe various findings about the sas7bdat format. 
In other words this is a completely reverse engineered library, based on analysis of actual sas7bdat files. During the last months we've had a good conversation going and actually fixing some of the remaining issues in parallel and bringing additions to each other's code.&lt;br /&gt;&lt;br /&gt;Today we've released version 0.1 of SassyReader. It's not yet ready for mission critical use as there are still quirks in the format that we haven't figured out. Also there are different shapes and sizes within the format that vary apparently depending on (I'm a bit guessing here) the amount of columns and the operating system that the file was written with. The good thing is that we have a quite extensive test set and for at least the files that I had lying around that I wanted to work with the reader managed to read all but one (11 out of 12)!&lt;br /&gt;&lt;br /&gt;Please visit the &lt;a href="http://sassyreader.eobjects.org"&gt;SassyReader website&lt;/a&gt; for more details, and let me know your feedback!&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-9082335629674289771?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/9082335629674289771/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=9082335629674289771' title='3 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/9082335629674289771'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/9082335629674289771'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2011/06/sassyreader-open-source-reader-of-sas.html' title='SassyReader - Open Source reader of SAS data sets for 
Java'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://1.bp.blogspot.com/-ZIcMJjT2WdE/Tf-LtfnKUAI/AAAAAAAAAIU/w1-5zgddTlI/s72-c/sassyreader.jpg' height='72' width='72'/><thr:total>3</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-3375192329361224326</id><published>2011-05-31T22:03:00.000+02:00</published><updated>2011-05-31T22:03:59.357+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='quality'/><category scheme='http://www.blogger.com/atom/ns#' term='data'/><category scheme='http://www.blogger.com/atom/ns#' term='analysis'/><category scheme='http://www.blogger.com/atom/ns#' term='value'/><category scheme='http://www.blogger.com/atom/ns#' term='dqa'/><category scheme='http://www.blogger.com/atom/ns#' term='data profiling'/><title type='text'>The value of data quality</title><content type='html'>The other day I stumbled into an &lt;a href="http://gojko.net/2011/05/17/bug-statistics-are-a-waste-of-time/"&gt;interesting blog post&lt;/a&gt; about software quality, and this brilliant quote (by Alan Weiss) kept flashing on my internal billboard over and over again...&lt;br /&gt;&lt;blockquote&gt;Quality [...] 
is not the absence of something in management's eyes, that is, defects, but the presence of something in the consumer's eyes, that is, &lt;i&gt;value&lt;/i&gt;.&lt;/blockquote&gt;What struck me was an explanation for one of the findings that I quite often get when people tell me that they are using &lt;a href="http://datacleaner.eobjects.org"&gt;DataCleaner&lt;/a&gt; or competing tools, and they find value in even some of the simplest functionalities in there - That even the simplest of features can provide a fortune of value. Coming from the world of tools and product development we tend to look at feature comparisons and technical capabilities a lot. And even customers also use such arguments for choosing a tool. And of course it makes a lot of sense because quality, as in the amount of value provided for the consumer, is very hard to measure and compare.&lt;br /&gt;&lt;br /&gt;So how can a software product deliver high quality, in the sense of consumer value? I believe it is a mix of making the product fairly easy to use as well as solving concrete problems for the consumer. In DataCleaner we've done a lot of work to make the tool work as a generic Data Quality Analysis (DQA) tool for a wide variety of data types. But maybe we should also consider building more domain targeted packages where you can easily do a "data value assesment" (to twist the words of Alan Weiss a little) for particular domains, eg. customer data, product data, geographic data and more.&lt;br /&gt;&lt;br /&gt;What do you think? Should data profiling and DQA stay generic, or should it target specific domains? 
Can it do both?&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-3375192329361224326?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/3375192329361224326/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=3375192329361224326' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/3375192329361224326'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/3375192329361224326'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2011/05/value-of-data-quality.html' title='The value of data quality'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-7492100839843139087</id><published>2011-05-13T19:12:00.002+02:00</published><updated>2011-05-13T19:13:27.748+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='sql'/><category scheme='http://www.blogger.com/atom/ns#' term='performance'/><category scheme='http://www.blogger.com/atom/ns#' term='java'/><category scheme='http://www.blogger.com/atom/ns#' term='speed'/><category scheme='http://www.blogger.com/atom/ns#' term='query'/><category scheme='http://www.blogger.com/atom/ns#' term='fetch'/><category scheme='http://www.blogger.com/atom/ns#' term='metamodel'/><category scheme='http://www.blogger.com/atom/ns#' 
term='optimization'/><category scheme='http://www.blogger.com/atom/ns#' term='fast'/><category scheme='http://www.blogger.com/atom/ns#' term='database'/><category scheme='http://www.blogger.com/atom/ns#' term='size'/><category scheme='http://www.blogger.com/atom/ns#' term='jdbc'/><title type='text'>Speed up your JDBC queries with MetaModel</title><content type='html'>Although the recent &lt;a href="http://metamodel.eobjects.org"&gt;MetaModel&lt;/a&gt; version 1.7.1 was a minor release there is something about this release that excites me very much! That thing is our new &lt;b&gt;FETCH_SIZE calculator&lt;/b&gt;. So what is that you might say? Well, here’s a little background.&lt;br /&gt;&lt;br /&gt;In JDBC there is an option to set the so-called FETCH_SIZE of a query statement, like this:&lt;br /&gt;&lt;br /&gt;&lt;pre class="prettyprint lang-java"&gt;Statement st = connection.createStatement();&lt;br /&gt;st.&lt;a href="http://download.oracle.com/javase/6/docs/api/java/sql/Statement.html#setFetchSize%28int%29"&gt;setFetchSize&lt;/a&gt;(&lt;b&gt;10000&lt;/b&gt;);&lt;br /&gt;ResultSet rs = st.executeQuery(...);&lt;/pre&gt;&lt;br /&gt;What this does is that it tells the database how many rows should be fetched from the database when more rows are needed. In other words: A buffer size, measured in the &lt;b&gt;amount of rows&lt;/b&gt;.&lt;br /&gt;&lt;br /&gt;Cool – so this means you should set a rather high fetch size if you have plenty of memory? Say 20.000 records? Well, sometimes yes, but not quite at all times!&lt;br /&gt;&lt;br /&gt;The guys over at Oracle has done an exemplary work on &lt;a href="http://www.oracle.com/technetwork/database/enterprise-edition/memory.pdf"&gt;documenting the memory consumption&lt;/a&gt; of their latest JDBC driver. From this you can learn that if you put a too high fetch size, you might run out of memory! But on the other hand you would really like to optimize the buffering of your result sets! 
The trouble is that the row size in memory is hugely different if you select 1, 2, 10 or 300 columns in your query! Furthermore the size of a VARCHAR(4000) value in the buffer is about 8000 times larger than a BIT value! These findings led to a very nice new feature in MetaModel: Dynamically &lt;b&gt;setting an appropriate fetch size, depending on the query you are executing&lt;/b&gt;...&lt;br /&gt;&lt;br /&gt;How does this work? Well basically MetaModel has all the needed meta-information for doing a strictly mathematical calculation. MetaModel knows the data types of all your columns, it knows their column sizes and most importantly – it knows which columns you are querying! So if we want to allow e.g. up to 8 megs of memory to be used for the query buffer, then it’s very easy for us to figure out the needed FETCH_SIZE – in principle we just divide the available memory by the size of a single row in the buffer.&lt;br /&gt;&lt;br /&gt;So far I’ve tested it out together with some friends of &lt;a href="http://datacleaner.eobjects.org/"&gt;DataCleaner&lt;/a&gt; that have both small tables and 200-300 column tables and they see a dramatic improvement in performance, and it also prevents a few OutOfMemory issues!&lt;br /&gt;&lt;br /&gt;A final remark – I haven’t seen any other Java frameworks that do something like this. 
Even Hibernate only has the option to specify a constant FETCH_SIZE:&lt;br /&gt;&lt;blockquote&gt;&amp;lt;property name="hibernate.jdbc.fetch_size"&amp;gt;10000&amp;lt;/property&amp;gt;&lt;br /&gt;&lt;/blockquote&gt;&lt;br /&gt;But as explained in this blog entry you will hopefully agree that such an approach is not optimal unless your application only executes a single query again and again!&lt;br /&gt;&lt;br /&gt;For more information on MetaModel, please &lt;a href="http://metamodel.eobjects.org"&gt;visit the website&lt;/a&gt;.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-7492100839843139087?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/7492100839843139087/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=7492100839843139087' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/7492100839843139087'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/7492100839843139087'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2011/05/speed-up-your-jdbc-queries-with.html' title='Speed up your JDBC queries with MetaModel'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' 
src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-3745591835322867600</id><published>2011-04-13T21:32:00.000+02:00</published><updated>2011-04-13T21:35:25.242+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='data'/><category scheme='http://www.blogger.com/atom/ns#' term='data quality'/><category scheme='http://www.blogger.com/atom/ns#' term='discovery'/><category scheme='http://www.blogger.com/atom/ns#' term='exploration'/><category scheme='http://www.blogger.com/atom/ns#' term='data quality analysis'/><category scheme='http://www.blogger.com/atom/ns#' term='data profiling'/><title type='text'>Two types of data profiling</title><content type='html'>Recently I've been blogging about how I see that DataCleaner is what I would dub a '&lt;a href="http://www.datavaluetalk.com/2011/02/17/data-quality-analysis-%E2%80%93-it-requires-a-bit-of-all-worlds/"&gt;Data Quality Analysis (DQA) tool&lt;/a&gt;' more than anything else. This leads me to an explanation of what I mean by DQA tool, profiling tool and more.&lt;br /&gt;&lt;br /&gt;So...&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://4.bp.blogspot.com/-iS5ARsPcAOM/TaX6OQefKsI/AAAAAAAAAII/Ysza_Y1O_Wg/s1600/profiling.jpg" imageanchor="1" style="clear:right; float:right; margin-left:1em; margin-bottom:1em"&gt;&lt;img border="0" height="200" width="240" src="http://4.bp.blogspot.com/-iS5ARsPcAOM/TaX6OQefKsI/AAAAAAAAAII/Ysza_Y1O_Wg/s400/profiling.jpg" /&gt;&lt;/a&gt;&lt;/div&gt;Data profiling is in my worldview the activity of extracting (and possibly refining) a set of analysis metrics from your data. As such it is a quite boring and trivial tasks that you can even automate rather easily.&lt;br /&gt;&lt;br /&gt;The interesting question is not &lt;b&gt;what&lt;/b&gt; data profiling is, but &lt;b&gt;why&lt;/b&gt; it is! 
I see two main reasons and they have quite different characteristics as to how you would use a profiling tool!&lt;br /&gt;&lt;br /&gt;The (chronologically) first reason that you would apply data profiling is to perform an analysis. Not a technical analysis, but an analysis where you apply your human reasoning. You investigate the metrics to &lt;i&gt;discover&lt;/i&gt; your data. If you're a good analyst you will also continuously refine your analysis, challenge it and change settings to see what happens. For this a profiling tool enables you to go below "the tip of the iceberg" (a common phrase about profiling) in your datastores.&lt;br /&gt;&lt;br /&gt;The second reason is for monitoring your data quality. A profiling tool has the power to extract the metrics so you will often see that people use profiling tools to perform a set of data quality validation tasks. It is used as a way to retain a quality level. In this case of data profiling you execute the same analysis again and again - only the data changes (over time).&lt;br /&gt;&lt;br /&gt;Do you see other applications of data profiling tools?&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-3745591835322867600?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/3745591835322867600/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=3745591835322867600' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/3745591835322867600'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/3745591835322867600'/><link rel='alternate' type='text/html' 
href='http://kasper.eobjects.org/2011/04/two-types-of-data-profiling.html' title='Two types of data profiling'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://4.bp.blogspot.com/-iS5ARsPcAOM/TaX6OQefKsI/AAAAAAAAAII/Ysza_Y1O_Wg/s72-c/profiling.jpg' height='72' width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-5726731471870565092</id><published>2011-03-19T19:57:00.000+01:00</published><updated>2011-03-19T19:57:08.159+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='user experience'/><category scheme='http://www.blogger.com/atom/ns#' term='documents'/><category scheme='http://www.blogger.com/atom/ns#' term='ease of use'/><category scheme='http://www.blogger.com/atom/ns#' term='ui'/><category scheme='http://www.blogger.com/atom/ns#' term='user interface'/><category scheme='http://www.blogger.com/atom/ns#' term='window'/><category scheme='http://www.blogger.com/atom/ns#' term='datacleaner'/><title type='text'>Coming up in DataCleaner: Single window UI</title><content type='html'>Lately I've been working on an fundamental improvement to the UI of &lt;a href="http://datacleaner.eobjects.org"&gt;DataCleaner&lt;/a&gt; that I think is quite important, and I would really like to present it to gather some reactions and ideas for further development.&lt;br /&gt;&lt;br /&gt;The idea was to change the UI metaphor with regards to windows. 
Currently you might say that the left hand window is the "environment" window - it contains management of datastores and reference data (dictionaries, synonyms, patterns) as well as the file menu which includes links to various global dialogs etc. The issue with this window is that it is an additional window that you always have to carry with you, even though you want to focus on a particular job. My aim was to find a way to remove the window, but retain all it's abilities, but make the job creation the main focus of the UI.&lt;br /&gt;&lt;br /&gt;So here's what we've done so far:&lt;br /&gt;&lt;ul&gt;&lt;li&gt;Moved datastore selection (and management) to the workbench (analysis job) window.&lt;/li&gt;&lt;li&gt;Moved the file menu to the workbench window.&lt;/li&gt;&lt;li&gt;Created a separate dialog for management of reference data, accessible through the file menu.&lt;/li&gt;&lt;/ul&gt;&lt;br /&gt;Here's a screenshot of the first concept, datastore selection and management. This is what you'll see when the app starts up (the cursor is hovering the MySQL icon, which is why it has an "information bubble"):&lt;br /&gt;&lt;br /&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://2.bp.blogspot.com/-GVJhaQI_iyo/TYT6--SgADI/AAAAAAAAAHw/Q2sKQ6oGNSw/s1600/screenshot1_create_datastore.jpg" imageanchor="1" style="margin-left:1em; margin-right:1em"&gt;&lt;img border="0" height="309" width="400" src="http://2.bp.blogspot.com/-GVJhaQI_iyo/TYT6--SgADI/AAAAAAAAAHw/Q2sKQ6oGNSw/s400/screenshot1_create_datastore.jpg" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;Once a datastore has been selected and the "Analyze!" 
button is clicked, the schema tree will become visible and will correspond to the analysis job window as you already know it.&lt;br /&gt;&lt;br /&gt;In the next screenshots you can see the contents of the new "Reference data" menu in the top of the screen:&lt;br /&gt;&lt;br /&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://2.bp.blogspot.com/-b5xL0Jalm3g/TYT7aUCSesI/AAAAAAAAAH4/CZJg9asoK7I/s1600/screenshot2_reference_data_menu.jpg" imageanchor="1" style="margin-left:1em; margin-right:1em"&gt;&lt;img border="0" height="309" width="400" src="http://2.bp.blogspot.com/-b5xL0Jalm3g/TYT7aUCSesI/AAAAAAAAAH4/CZJg9asoK7I/s400/screenshot2_reference_data_menu.jpg" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;... and the dialog that it links to:&lt;br /&gt;&lt;br /&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://1.bp.blogspot.com/-unHt4KrT02A/TYT7duKU2vI/AAAAAAAAAIA/3QSKa2fuZZQ/s1600/screenshot3_reference_data_dialog.jpg" imageanchor="1" style="margin-left:1em; margin-right:1em"&gt;&lt;img border="0" height="400" width="331" src="http://1.bp.blogspot.com/-unHt4KrT02A/TYT7duKU2vI/AAAAAAAAAIA/3QSKa2fuZZQ/s400/screenshot3_reference_data_dialog.jpg" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;So what do you think? I hope that it's a bit more intuitive of course. The metaphor was to make something similar to a typical office application, where you begin with an empty document which you can build, or you can choose to open a saved document if you want to resume previous work.&lt;br /&gt;&lt;br /&gt;If you want to try it out, simply check out the trunk of AnalyzerBeans and DataCleaner and build it! ... 
Here's the command line summary:&lt;br /&gt;&lt;br /&gt;&lt;blockquote&gt;svn co http://eobjects.org/svn/AnalyzerBeans/trunk/ AnalyzerBeans&lt;br /&gt;cd AnalyzerBeans&lt;br /&gt;mvn install&lt;br /&gt;cd ..&lt;br /&gt;svn co http://eobjects.org/svn/DataCleaner/trunk/ DataCleaner&lt;br /&gt;cd DataCleaner&lt;br /&gt;mvn install&lt;br /&gt;cd target&lt;br /&gt;java -jar DataCleaner.jar&lt;br /&gt;&lt;/blockquote&gt;&lt;br /&gt;Enjoy and please provide feedback :)&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-5726731471870565092?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/5726731471870565092/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=5726731471870565092' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/5726731471870565092'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/5726731471870565092'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2011/03/coming-up-in-datacleaner-single-window.html' title='Coming up in DataCleaner: Single window UI'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://2.bp.blogspot.com/-GVJhaQI_iyo/TYT6--SgADI/AAAAAAAAAHw/Q2sKQ6oGNSw/s72-c/screenshot1_create_datastore.jpg' height='72' 
width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-3655934787919324899</id><published>2011-03-03T18:45:00.001+01:00</published><updated>2011-03-03T18:45:45.347+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='filtering'/><category scheme='http://www.blogger.com/atom/ns#' term='visualization'/><category scheme='http://www.blogger.com/atom/ns#' term='winfried'/><category scheme='http://www.blogger.com/atom/ns#' term='name'/><category scheme='http://www.blogger.com/atom/ns#' term='van'/><category scheme='http://www.blogger.com/atom/ns#' term='names'/><category scheme='http://www.blogger.com/atom/ns#' term='holland'/><category scheme='http://www.blogger.com/atom/ns#' term='flow'/><category scheme='http://www.blogger.com/atom/ns#' term='datacleaner'/><title type='text'>The complexity of cleansing a name</title><content type='html'>Following up on Winfried van Holland's blog post on  &lt;a href="http://www.datavaluetalk.com/2011/02/14/we-have-180-million-names-which-one-is-right/"&gt;having to cleanse 180 million names&lt;/a&gt; I thought I'd share a cool screenshot from a funny moment at the Human Inference office today, when we where experimenting with good ways to visualize our execution flow in DataCleaner...&lt;br /&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://3.bp.blogspot.com/-j-qUgWP8_Pw/TW_THTtcX7I/AAAAAAAAAHo/wWQbT_neiEk/s1600/complexity_of_cleansing_a_name.jpg" imageanchor="1" style="margin-left:1em; margin-right:1em"&gt;&lt;img border="0" height="315" width="400" src="http://3.bp.blogspot.com/-j-qUgWP8_Pw/TW_THTtcX7I/AAAAAAAAAHo/wWQbT_neiEk/s400/complexity_of_cleansing_a_name.jpg" /&gt;&lt;/a&gt;&lt;/div&gt;The diagram shows 1) the steps that we go through in our filtering process, 2) the dependencies between them, 3) and a circle layout that we tried out and dismissed ;-)&lt;br /&gt;&lt;br /&gt;Anyways, a funny moment and quite a nice 
view of just how much execution power you can extract from a tool like &lt;a href="http://datacleaner.eobjects.org"&gt;DataCleaner&lt;/a&gt;.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-3655934787919324899?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/3655934787919324899/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=3655934787919324899' title='1 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/3655934787919324899'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/3655934787919324899'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2011/03/complety-of-cleansing-name.html' title='The complexity of cleansing a name'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://3.bp.blogspot.com/-j-qUgWP8_Pw/TW_THTtcX7I/AAAAAAAAAHo/wWQbT_neiEk/s72-c/complexity_of_cleansing_a_name.jpg' height='72' width='72'/><thr:total>1</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-2749725609255149078</id><published>2011-02-17T11:46:00.001+01:00</published><updated>2011-02-17T11:51:40.405+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='data quality'/><category scheme='http://www.blogger.com/atom/ns#' term='analysis'/><category 
scheme='http://www.blogger.com/atom/ns#' term='tools'/><category scheme='http://www.blogger.com/atom/ns#' term='tool support'/><category scheme='http://www.blogger.com/atom/ns#' term='filter'/><category scheme='http://www.blogger.com/atom/ns#' term='dqa'/><category scheme='http://www.blogger.com/atom/ns#' term='preprocessing'/><category scheme='http://www.blogger.com/atom/ns#' term='evolution'/><category scheme='http://www.blogger.com/atom/ns#' term='transformation'/><category scheme='http://www.blogger.com/atom/ns#' term='datacleaner'/><title type='text'>Pre-processing in DataCleaner 2: Why?</title><content type='html'>&lt;p&gt;Last monday we &lt;a href="http://datacleaner.eobjects.org/newsitem/datacleaner-2.0-released"&gt;released the new DataCleaner 2.0&lt;/a&gt; and one of the major new features in it is the ability to transform and filter your data using the tool. Previously the common answer for someone asking about transformation in DataCleaner has been to tell them that "there are already good tools for this available elsewhere, so use them". So why did we choose to focus on data processing in DataCleaner 2.0? And what are the capabilities of DataCleaner in terms of transformations? Is DataCleaner over time going to evolve into a full-fledged ETL tool? I'll try to answer these questions.&lt;/p&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://1.bp.blogspot.com/-pQ2LkysulOs/TVz9oEWS5zI/AAAAAAAAAHg/ml8jl6SOE68/s1600/preprocessing.jpg" imageanchor="1" style="margin-left:1em; margin-right:1em"&gt;&lt;img border="0" height="150" width="400" src="http://1.bp.blogspot.com/-pQ2LkysulOs/TVz9oEWS5zI/AAAAAAAAAHg/ml8jl6SOE68/s400/preprocessing.jpg" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;h3&gt;All your data quality functions in one place&lt;/h3&gt;&lt;p&gt;The obvious answer is that we want to provide more data quality functionality and that transformations is something that a lot of people need. 
While doing data profiling it is often needed to do data adjustments, e.g. to tokenize values, extract certain information, filter which rows get profiled etc. You can also do all this by applying an ETL tool or maybe even by creating database VIEWs. The problem with such an approach is that it will eventually get in your way because you're trying to get 2-3 independent tools to work nicely together, instead of just having these functions available where you need them.&lt;/p&gt;&lt;h3&gt;Transformations for the DQ domain&lt;/h3&gt;&lt;p&gt;Possibly even more important is that the transformations that you want to employ in Data Quality analysis are typically quite different than those that come out-of-the-box in database scripts and ETL tools. Such tools are typically quite generic and will provide general purpose tokenizers etc., but will typically lack transformations pertaining to the DQ-domain, such as date mask matchers, dictionary lookups and synonym replacements, standardization of email addresses, names and URLs.&lt;/p&gt;&lt;h3&gt;Non-persistent transformations&lt;/h3&gt;&lt;p&gt;When you do pre-processing in separate tools, you also need to persist your transformed datasets. In Data Quality analysis this is just a waste of resources and provides poor performance. If you need to perform transformations, apply filtering etc. for the purpose of analysis, profiling and exploring your data it is much more feasible to just perform these transformations when needed instead of storing them up front. This also allows for a much more free user experience where you can actually experiment with your data and your analysis instead of having to overthink it.&lt;/p&gt;&lt;h3&gt;Is DataCleaner's validator gone? No, it's just filtering with added output handling!&lt;/h3&gt;&lt;p&gt;DataCleaner 1.x was known to have a strict separation between the activities "profile" and "validate". 
Since this separation is gone in 2.0, one might ask, "Is DataCleaner's validator gone? I can only see analysis jobs!". But the answer is no, we just consider validation as a type of analysis (thus, analysis is a broader term, comprising both profiling, validation and more). You can easily perform all the validation operations of DataCleaner 1.x, but the approach is a bit different because you basically apply filters in stead of "validators". There is even an example of this in the "&lt;a href="http://datacleaner.eobjects.org/media"&gt;Introduction to analyzing, cleansing and filtering data&lt;/a&gt;" webcast, available on DataCleaner's website.&lt;/p&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-2749725609255149078?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/2749725609255149078/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=2749725609255149078' title='5 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/2749725609255149078'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/2749725609255149078'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2011/02/pre-processing-in-datacleaner-2-why.html' title='Pre-processing in DataCleaner 2: Why?'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' 
url='http://1.bp.blogspot.com/-pQ2LkysulOs/TVz9oEWS5zI/AAAAAAAAAHg/ml8jl6SOE68/s72-c/preprocessing.jpg' height='72' width='72'/><thr:total>5</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-7775628543658251805</id><published>2011-02-14T11:42:00.010+01:00</published><updated>2011-02-14T12:25:52.391+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='humaninference'/><category scheme='http://www.blogger.com/atom/ns#' term='acquisition'/><category scheme='http://www.blogger.com/atom/ns#' term='metamodel'/><category scheme='http://www.blogger.com/atom/ns#' term='human inference'/><category scheme='http://www.blogger.com/atom/ns#' term='release'/><category scheme='http://www.blogger.com/atom/ns#' term='job'/><category scheme='http://www.blogger.com/atom/ns#' term='dataqualitypro'/><category scheme='http://www.blogger.com/atom/ns#' term='datacleaner'/><title type='text'>DataCleaner 2 released, Human Inference acquires eobjects.org, I have a new job and dataqualitypro.com even publishes a nice article about it</title><content type='html'>Earlier I've promised you some "&lt;a href="http://kasper.eobjects.org/2011/02/big-news.html"&gt;Big news&lt;/a&gt;" today and here it is... Hmm, where to start, so much to say.&lt;br /&gt;&lt;br /&gt;&lt;hr/&gt;OK, let's start with the software, after all that's what I think most of my blog readers care most about:&lt;br /&gt;&lt;br /&gt;&lt;img border="0" height="100" width="100" style="float: right; border: none; margin: 10px;" src="http://2.bp.blogspot.com/-oBf_oF49Pz0/TVkJDKxGUpI/AAAAAAAAAG4/EXq2mjl79IU/s400/dc-logo.jpg" /&gt;&lt;br /&gt;&lt;h3&gt;DataCleaner 2.0 was released!&lt;/h3&gt;To me this is the biggest news for the DataCleaner community in a LONG time. DataCleaner 2.0 is a major release that I and my new employer (read more below) have put a lot of effort into. 
I just had a look at some source code statistics and actually the 2.0 release is larger (in terms of lines of code, source code commits, contributions etc.) than all previous DataCleaner releases together. I don't want to say a lot about the new functionality here, because it's all presented quite well at the &lt;a href="http://datacleaner.eobjects.org"&gt;DataCleaner website&lt;/a&gt;.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;More information&lt;/b&gt;: &lt;a href="http://datacleaner.eobjects.org/newsitem/datacleaner-2.0-released"&gt;Watch out, dirty data! DataCleaner 2.0 is in town!&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;img border="0" height="80" width="80" style="float: right; border: none; margin: 10px;"  src="http://4.bp.blogspot.com/-mcJXP3MrQwo/TVkJSa6j9fI/AAAAAAAAAHA/wv8v9xIjjdM/s400/metamodel-logo.jpg" /&gt;&lt;br /&gt;&lt;h3&gt;MetaModel 1.5 was released!&lt;/h3&gt;My other lovechild, &lt;a href="http://metamodel.eobjects.org"&gt;MetaModel&lt;/a&gt;, have also just been released in a version 1.5! MetaModel 1.5 is also a quite big improvement on the previous 1.2 version. The release contains a lot of exciting new features for doing querying and datastore exploration as well as a lot of maturity bugfixes.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;More information&lt;/b&gt;: &lt;a href="http://metamodel.eobjects.org/whats_new.html"&gt;What's new in MetaModel 1.5?&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;hr/&gt;And then let's move on to a major announcement that I definately think will affect the eobjects.org community positively:&lt;br /&gt;&lt;br /&gt;&lt;img border="0" height="58" width="200"  style="float: right; border: none; margin: 10px;" src="http://4.bp.blogspot.com/-Y5JrVhdN_9I/TVkJazhl7CI/AAAAAAAAAHI/5Wtdvefaucs/s400/hi-logo.jpg" /&gt;&lt;br /&gt;&lt;h3&gt;Human Inference acquires eobjects.org&lt;/h3&gt;This might come as a surprise to quite a lot of you, so let me explain a bit. 
For some years DataCleaner and the other eobjects.org projects have been independent open source projects that I've invested a lot of time in. The projects have grown nicely in terms of users and the ideas have been manifold. My ambitions for the projects have always been high, but they suffered from the fact that I was mostly working on them in my free time. One of the many fun things about doing these projects was that I've gotten to meet up with a lot of exciting people that thought my projects were interesting. At one time I met some people from the data quality vendor &lt;a href="http://www.humaninference.com"&gt;Human Inference&lt;/a&gt;, who thought DataCleaner was great and they wanted to know if they could in some way use it in collaboration with their commercial offerings. From my end of the table I was on the other hand thinking that their products offered some features that would be an excellent addition to DataCleaner's functionality. So what we did was a deal to try and raise the value for both parties. And with that in mind, here's the press release about it:&lt;br /&gt;&lt;br /&gt;&lt;b&gt;More information&lt;/b&gt;: &lt;a href="http://www.humaninference.com/News%20-%20Events/News/Human%20Inference%20completes%20acquisition%20of%20DataCleaner%20and%20eobjects.aspx"&gt;Human Inference completes acquisition of DataCleaner and eobjects.org&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;h3&gt;I have a new job&lt;/h3&gt;I now work with Human Inference to actively grow the DataCleaner project, MetaModel as well as Human Inference's commercial products. We're building really exciting cloud-based data quality services that I think will complement the open source offering nicely. 
Of course it's not all going to be free, but I promise that even for the users who don't want to pay for the additional services, the acquisition and my new job will be beneficial anyway, because we're adding a lot of new resources to the projects that are improving on both the open source parts and the commercial plug-ins.&lt;br /&gt;&lt;br /&gt;&lt;hr/&gt;&lt;img border="0" height="48" width="200" style="float: right; border: none; margin: 10px;" src="http://3.bp.blogspot.com/-N8IR-4zG8tg/TVkJ0X6iSnI/AAAAAAAAAHY/sFigFRyFovw/s400/dataqualitypro-logo.jpg" /&gt;&lt;br /&gt;And in the end I just also want to mention that Data Quality Pro has a great article about a lot of these news, including an interview with me and Sabine Palinckx, the CEO of Human Inference.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;More information&lt;/b&gt;: &lt;a href="http://www.dataqualitypro.com/data-quality-home/open-source-datacleaner-gets-a-major-update-human-inference.html"&gt;Open Source DataCleaner gets a major update, Human Inference enters the Open Source Data Quality Market&lt;/a&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-7775628543658251805?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/7775628543658251805/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=7775628543658251805' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/7775628543658251805'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/7775628543658251805'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2011/02/datacleaner-2-released-human-inference.html' 
title='DataCleaner 2 released, Human Inference acquires eobjects.org, I have a new job and dataqualitypro.com even publishes a nice article about it'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://2.bp.blogspot.com/-oBf_oF49Pz0/TVkJDKxGUpI/AAAAAAAAAG4/EXq2mjl79IU/s72-c/dc-logo.jpg' height='72' width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-4030752379168024825</id><published>2011-02-04T12:42:00.002+01:00</published><updated>2011-02-04T12:42:36.796+01:00</updated><title type='text'>Big news ...</title><content type='html'>&lt;div dir="ltr" style="text-align: left;" trbidi="on"&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://1.bp.blogspot.com/_UpvxZrigQfQ/TUvmHPwb12I/AAAAAAAAAGw/ew2oCKhDiV4/s1600/qr.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="312" src="http://1.bp.blogspot.com/_UpvxZrigQfQ/TUvmHPwb12I/AAAAAAAAAGw/ew2oCKhDiV4/s400/qr.png" width="312" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-4030752379168024825?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/4030752379168024825/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=4030752379168024825' title='3 Comments'/><link rel='edit' type='application/atom+xml' 
href='http://www.blogger.com/feeds/1301643147176033927/posts/default/4030752379168024825'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/4030752379168024825'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2011/02/big-news.html' title='Big news ...'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://1.bp.blogspot.com/_UpvxZrigQfQ/TUvmHPwb12I/AAAAAAAAAGw/ew2oCKhDiV4/s72-c/qr.png' height='72' width='72'/><thr:total>3</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-5215629668171603673</id><published>2011-01-18T15:59:00.002+01:00</published><updated>2011-01-18T16:01:13.423+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='sql'/><category scheme='http://www.blogger.com/atom/ns#' term='scripts'/><category scheme='http://www.blogger.com/atom/ns#' term='data quality'/><category scheme='http://www.blogger.com/atom/ns#' term='dataprofiling'/><category scheme='http://www.blogger.com/atom/ns#' term='profiling'/><category scheme='http://www.blogger.com/atom/ns#' term='diy'/><category scheme='http://www.blogger.com/atom/ns#' term='database'/><category scheme='http://www.blogger.com/atom/ns#' term='data profiling'/><title type='text'>It’s very easy to make your own data profiling logic – but wrong!</title><content type='html'>&lt;a style="float: right"&gt;&lt;img border="0" height="64" width="64" src="http://1.bp.blogspot.com/_UpvxZrigQfQ/TTWqLaFYtrI/AAAAAAAAAGk/0FF7XpWjVeU/s400/sql.jpg" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;In the world of data quality I often see that people tend to think that a profiling 
application isn’t that important because the user can easily analyze his data using SQL and similar methods already. It can actually make it quite difficult to convince users to try out a tool built for the job.&lt;br /&gt;&lt;br /&gt;Let me tell you about a situation that I have been in quite a few times: If I am to do a quick demonstration of &lt;a href="http://datacleaner.eobjects.org"&gt;my favourite data profiling application&lt;/a&gt; then I will begin with something like a &lt;a href="http://4.bp.blogspot.com/_UpvxZrigQfQ/TQd6IX99gCI/AAAAAAAAAF4/oFbG4RudgRs/s1600/value-dist-chart1.png"&gt;Value distribution chart&lt;/a&gt;. The respondent watching the demo would then reply:&lt;br /&gt;&lt;br /&gt;&lt;blockquote&gt;"... But I could easily do this with just a simple GROUP BY query."&lt;/blockquote&gt;&lt;br /&gt;And my answer would be YES, you can.&lt;br /&gt;&lt;br /&gt;So then I will proceed to demonstrate various other metrics such as null counts, empty string counts, length maximum/minimum etc. An SQL-hefty respondent might reply:&lt;br /&gt;&lt;br /&gt;&lt;blockquote&gt;"... Sure, but I can also use AVG, WHERE x IS NULL, length functions etc. in SQL".&lt;/blockquote&gt;&lt;br /&gt;And my answer would be YES, you can.&lt;br /&gt;&lt;br /&gt;If at this point I dive into the discussion on these terms, then I would demonstrate hard-to-query measures such as diacritic count, word count, non-letter chars and case distribution. But ultimately I will also myself have misunderstood the point of using a data profiling application because the point isn’t (so much) that it contains measures that are not possible to express as queries.&lt;br /&gt;&lt;br /&gt;The point is that the measures and their impact to your data's profile are not always known to you in advance. 
If you know just what you’re looking for, then you’ll find it and nothing more.&lt;br /&gt;&lt;br /&gt;So while it’s quite easy (or at least possible) to design your own profiling plan using SQL and various other querying extensions, it’s not going to help you much in terms of profiling, because profiling should be an exploratory, analytical and interactive discipline.&lt;br /&gt;&lt;br /&gt;To scare you off, I think I’ll try creating a few SQL queries that could compensate for most of the functionality in &lt;a href="http://datacleaner.eobjects.org"&gt;DataCleaner&lt;/a&gt;. I imagine they’ll be quite amusing, so hold on for an update...&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-5215629668171603673?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/5215629668171603673/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=5215629668171603673' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/5215629668171603673'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/5215629668171603673'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2011/01/its-very-easy-to-make-your-own-data.html' title='It’s very easy to make your own data profiling logic – but wrong!'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><media:thumbnail 
xmlns:media='http://search.yahoo.com/mrss/' url='http://1.bp.blogspot.com/_UpvxZrigQfQ/TTWqLaFYtrI/AAAAAAAAAGk/0FF7XpWjVeU/s72-c/sql.jpg' height='72' width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-102835014442214357</id><published>2010-12-29T19:25:00.002+01:00</published><updated>2010-12-30T09:36:12.036+01:00</updated><title type='text'>YADC2S</title><content type='html'>Yet Another DataCleaner 2.0 Screenshot :) This is basically just an addition to my previous post about &lt;a href="http://kasper.eobjects.org/2010/12/richer-reporting-and-charts-in.html"&gt;richer reporting and charts in DataCleaner 2.0&lt;/a&gt;.&lt;br /&gt;&lt;br /&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://3.bp.blogspot.com/_UpvxZrigQfQ/TRt8eyLStFI/AAAAAAAAAGc/wRqKw11wZRA/s1600/Screenshot-2.png" imageanchor="1" style="margin-left:1em; margin-right:1em"&gt;&lt;img border="0" height="262" width="400" src="http://3.bp.blogspot.com/_UpvxZrigQfQ/TRt8eyLStFI/AAAAAAAAAGc/wRqKw11wZRA/s400/Screenshot-2.png" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;What you see is our new Date Gap Analyzer which can be used to plot a timeline based on FROM and TO dates in a dataset. The analyzer will display gaps in the timeline and overlaps (periods where more than one record exist). 
This should be pretty useful for finding errors in datasets that contain continuous activities.&lt;br /&gt;&lt;br /&gt;The chart is zoomable and scrollable so it is capable of displaying quite a lot of data without harming the visual appearance.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-102835014442214357?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/102835014442214357/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=102835014442214357' title='7 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/102835014442214357'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/102835014442214357'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2010/12/yadc2s.html' title='YADC2S'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://3.bp.blogspot.com/_UpvxZrigQfQ/TRt8eyLStFI/AAAAAAAAAGc/wRqKw11wZRA/s72-c/Screenshot-2.png' height='72' width='72'/><thr:total>7</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-4668867609728248192</id><published>2010-12-27T09:51:00.003+01:00</published><updated>2010-12-27T21:10:55.061+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='data quality'/><category scheme='http://www.blogger.com/atom/ns#' 
term='heuristics'/><category scheme='http://www.blogger.com/atom/ns#' term='board'/><category scheme='http://www.blogger.com/atom/ns#' term='match'/><category scheme='http://www.blogger.com/atom/ns#' term='matching'/><category scheme='http://www.blogger.com/atom/ns#' term='game'/><title type='text'>Match! Boardgame about the heuristics in data matching(?)</title><content type='html'>This morning I was enjoying a bit of Good Clean Family Christmas TV You Can Trust and one of the subjects covered was a new Danish board game that you could spend your Christmas vacation playing. It's called &lt;a href="http://www.matchspillet.dk/"&gt;Match!&lt;/a&gt; and here I will try to outline the rules as I understand them:&lt;br /&gt;&lt;br /&gt;&lt;ul&gt;&lt;li&gt;Each player has 3 game cards with a picture of something on their hand.&lt;/li&gt;&lt;li&gt;A picture is shown to all players and they now have to match that picture with one of the pictures on their hand.&lt;/li&gt;&lt;li&gt;In the example shown on the TV there was a picture of some sausages on a grill. 
The example matches of the three players were:&lt;ul&gt;&lt;li&gt;Danish politician Pia Kjærsgaard - both she and the sausages represent something very Danish and something they'd like to put on a grill!&lt;/li&gt;&lt;li&gt;A used roll of toilet paper - related to a different kind of "sausage"!&lt;/li&gt;&lt;li&gt;A crowd at a musical festival - a place where you'd love to eat a grilled sausage.&lt;/li&gt;&lt;/ul&gt;&lt;/li&gt;&lt;/ul&gt;&lt;br /&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://3.bp.blogspot.com/_UpvxZrigQfQ/TRhUho_m94I/AAAAAAAAAGU/4_vMBE8d6Wg/s1600/Match_setup7_600.jpg" imageanchor="1" style="margin-left:1em; margin-right:1em"&gt;&lt;img border="0" height="150" width="400" src="http://3.bp.blogspot.com/_UpvxZrigQfQ/TRhUho_m94I/AAAAAAAAAGU/4_vMBE8d6Wg/s400/Match_setup7_600.jpg" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;The matches themselves were not the best I've seen but they do point out an important feature of a good matching engine: Using simple similarity checks is not enough. You need to understand not only the spelling, phonetics etc. of the things you are trying to match, but also the semantics. Of course a good example of this in Denmark is our country's biggest company: "Maersk", which can rather easily be matched with "Mærsk" but it's more difficult to get the synonym "A.P. Møller" into the matching rules except if you hardcode it somehow. And if matching goes beyond just names other associative matching rules might apply.&lt;br /&gt;&lt;br /&gt;Well... Can't wait to play Match! 
It sounds like a fun game and it will definitely be in the back of my head to try and record some of the interesting heuristics applied there.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-4668867609728248192?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/4668867609728248192/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=4668867609728248192' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/4668867609728248192'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/4668867609728248192'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2010/12/match-boardgame-about-heuristics-in.html' title='Match! 
Boardgame about the heuristics in data matching(?)'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://3.bp.blogspot.com/_UpvxZrigQfQ/TRhUho_m94I/AAAAAAAAAGU/4_vMBE8d6Wg/s72-c/Match_setup7_600.jpg' height='72' width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-4114137400134142780</id><published>2010-12-14T15:27:00.000+01:00</published><updated>2010-12-14T15:27:31.839+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='data'/><category scheme='http://www.blogger.com/atom/ns#' term='similarity'/><category scheme='http://www.blogger.com/atom/ns#' term='value'/><category scheme='http://www.blogger.com/atom/ns#' term='distribution'/><category scheme='http://www.blogger.com/atom/ns#' term='levenshtein'/><category scheme='http://www.blogger.com/atom/ns#' term='chart'/><category scheme='http://www.blogger.com/atom/ns#' term='datacleaner'/><category scheme='http://www.blogger.com/atom/ns#' term='report'/><category scheme='http://www.blogger.com/atom/ns#' term='phonetic'/><category scheme='http://www.blogger.com/atom/ns#' term='data quality'/><category scheme='http://www.blogger.com/atom/ns#' term='metaphone'/><category scheme='http://www.blogger.com/atom/ns#' term='soundex'/><category scheme='http://www.blogger.com/atom/ns#' term='profiling'/><title type='text'>Richer reporting and charts in DataCleaner 2</title><content type='html'>One of the important new features of DataCleaner 2 will be a much richer reporting module than the old one. In DataCleaner 2 the result of an analysis is not limited to the crosstabular view that a lot of you know from DataCleaner 1.x. 
In this blog post I will provide you with a preview of some of the exciting reports that have been added lately.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Charts in Value distribution&lt;/b&gt;&lt;br /&gt;The Value distribution component is well-known to most DataCleaner users. It provides a simple but crucial look into the distribution of values for a column. In DataCleaner 2.0 we are enhancing the experience of working with the Value Distribution by applying visually pleasant charts as well as grouping of values with similar frequencies. Take a look at this example result on a country-column:&lt;br /&gt;&lt;br /&gt;&lt;div style="clear: both; text-align:center;"&gt;&lt;a href="http://4.bp.blogspot.com/_UpvxZrigQfQ/TQd6IX99gCI/AAAAAAAAAF4/oFbG4RudgRs/s1600/value-dist-chart1.png"&gt;&lt;img border="0" src="http://4.bp.blogspot.com/_UpvxZrigQfQ/TQd6IX99gCI/AAAAAAAAAF4/oFbG4RudgRs/s400/value-dist-chart1.png" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;Now you might think: "Looks nice, but that's going to be messy for columns with very oddly distributed values". And you're right. Except that we have applied a rather intelligent grouping mechanism that will make sure we never go above a certain amount of slices in a chart. To accomplish this we may need to group together some values by their frequencies which will communicate another important fact: &lt;i&gt;When repeated values occur, how many times do they occur&lt;/i&gt;. 
Take a look at this next example of the value distribution of a &lt;i&gt;customer number&lt;/i&gt; column:&lt;br /&gt;&lt;br /&gt;&lt;div style="clear: both; text-align:center;"&gt;&lt;a href="http://1.bp.blogspot.com/_UpvxZrigQfQ/TQd6_-MIOXI/AAAAAAAAAGA/H7DwWKd9AMQ/s1600/value-dist-chart2.png"&gt;&lt;img border="0" src="http://1.bp.blogspot.com/_UpvxZrigQfQ/TQd6_-MIOXI/AAAAAAAAAGA/H7DwWKd9AMQ/s400/value-dist-chart2.png" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;As you can see, even though there's a very high amount of customer numbers we are grouping them together by frequency. This is a principle that is actually already known from the &lt;b&gt;&amp;lt;unique&amp;gt;&lt;/b&gt; group, except that we now also apply it to further frequencies: &lt;b&gt;&amp;lt;group=2&amp;gt;&lt;/b&gt;, &lt;b&gt;&amp;lt;group=3&amp;gt;&lt;/b&gt; etc.&lt;br /&gt;&lt;br /&gt;Notice also the green arrows in the table to the right. Using this button (or by clicking the slices of the pie-chart) you will be able to &lt;b&gt;drill to detail&lt;/b&gt; to view the actual values that make up a given group.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Navigation tree in Phonetic similarity finder&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Another application of richer reporting in DataCleaner is for the new Phonetic similarity finder. In short this analyzer will apply a mix of well-known algorithms for similarity checking such as &lt;b&gt;Soundex&lt;/b&gt;, &lt;b&gt;Metaphone&lt;/b&gt; and &lt;b&gt;Levenshtein distance&lt;/b&gt; to produce a set of groups of similar sounding values. 
What you get is a tree of groups from where you can see the rows that are similar or maybe even identical:&lt;br /&gt;&lt;br /&gt;&lt;div style="clear: both; text-align:center;"&gt;&lt;a href="http://3.bp.blogspot.com/_UpvxZrigQfQ/TQd8mnXbjsI/AAAAAAAAAGI/jU_eV-E_dUg/s1600/phonetic-tree.png"&gt;&lt;img border="0" src="http://3.bp.blogspot.com/_UpvxZrigQfQ/TQd8mnXbjsI/AAAAAAAAAGI/jU_eV-E_dUg/s400/phonetic-tree.png" /&gt;&lt;/a&gt;&lt;/div&gt;&lt;br /&gt;The big news here is of course that this kind of result would be practically impossible to display in a crosstabular result of DataCleaner 1.x - which is also why DataCleaner 1.x doesn't have this feature. I hope that my message with this is clear: DataCleaner 2 will not only be a substantial improvement to the existing data profiling tool, but it will also open up a lot of new doors for more interactive (and interesting) analyses.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Pluggability&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;The last thing that I would like to point out in this blog entry is the fact that the rendering mechanism in DataCleaner 2.0 is pluggable. This means that you can very easily, using modular Java code, enhance the existing result renderers or implement your own, and simply plug it into the application. 
Just remember to contribute it back to the community :)&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-4114137400134142780?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/4114137400134142780/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=4114137400134142780' title='1 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/4114137400134142780'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/4114137400134142780'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2010/12/richer-reporting-and-charts-in.html' title='Richer reporting and charts in DataCleaner 2'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://4.bp.blogspot.com/_UpvxZrigQfQ/TQd6IX99gCI/AAAAAAAAAF4/oFbG4RudgRs/s72-c/value-dist-chart1.png' height='72' width='72'/><thr:total>1</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-1451733693272417256</id><published>2010-11-16T11:12:00.000+01:00</published><updated>2010-11-16T11:12:43.540+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='type parameter'/><category scheme='http://www.blogger.com/atom/ns#' term='java'/><category scheme='http://www.blogger.com/atom/ns#' term='generics'/><category scheme='http://www.blogger.com/atom/ns#' 
term='bug'/><category scheme='http://www.blogger.com/atom/ns#' term='compiler'/><category scheme='http://www.blogger.com/atom/ns#' term='jdk'/><category scheme='http://www.blogger.com/atom/ns#' term='datacleaner'/><title type='text'>Java 6 compilation problems</title><content type='html'>If you're trying to build DataCleaner 2 and get problems like this ...&lt;br /&gt;&lt;blockquote&gt;...src\main\java\org\eobjects\datacleaner\widgets\properties\ChangeRequirementButton.java:[94,6] inconvertible types&lt;br /&gt;found   : org.eobjects.analyzer.job.builder.AbstractBeanWithInputColumnsBuilder&amp;lt;capture#10 of ?,capture#279 of ?,capture#140 of ?&amp;gt;&lt;br /&gt;required: org.eobjects.analyzer.job.builder.FilterJobBuilder&amp;lt;?,?&amp;gt;&lt;br /&gt;&lt;br /&gt;...src\main\java\org\eobjects\datacleaner\widgets\properties\MultipleInputColumnsPropertyWidget.java:[116,61] inconvertible types&lt;br /&gt;found   : org.eobjects.analyzer.job.builder.AbstractBeanJobBuilder&amp;lt;capture#0 of ?,capture#51 of ?,capture#202 of ?&amp;gt;&lt;br /&gt;required: org.eobjects.analyzer.job.builder.TransformerJobBuilder&amp;lt;?&amp;gt;&lt;br /&gt;&lt;br /&gt;...src\main\java\org\eobjects\datacleaner\widgets\properties\SingleInputColumnPropertyWidget.java:[73,61] inconvertible types&lt;br /&gt;found   : org.eobjects.analyzer.job.builder.AbstractBeanJobBuilder&amp;lt;capture#29 of ?,capture#564 of ?,capture#109 of ?&amp;gt;&lt;br /&gt;required: org.eobjects.analyzer.job.builder.TransformerJobBuilder&amp;lt;?&amp;gt;&lt;br /&gt;&lt;/blockquote&gt;... Then I just wanted to let you know that it's actually a compiler error, not a source code error. The Java 6 compiler (pre update 18 I believe) seems to not be able to cope with subtypes of generic interfaces which contain different type parameters than the interface. 
Of course this should be (and is in newer compiler versions) possible because a subtype may implement an interface with certain type parameters and define a new set of type parameters. So ... Update to the newest JDK please :)&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-1451733693272417256?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/1451733693272417256/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=1451733693272417256' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/1451733693272417256'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/1451733693272417256'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2010/11/java-6-compilation-problems.html' title='Java 6 compilation problems'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-5749096302643540225</id><published>2010-10-26T21:07:00.001+02:00</published><updated>2010-10-26T22:17:13.501+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='column'/><category scheme='http://www.blogger.com/atom/ns#' term='pattern'/><category scheme='http://www.blogger.com/atom/ns#' term='finder'/><category scheme='http://www.blogger.com/atom/ns#' term='prototype'/><category 
scheme='http://www.blogger.com/atom/ns#' term='user interface'/><category scheme='http://www.blogger.com/atom/ns#' term='database'/><category scheme='http://www.blogger.com/atom/ns#' term='datacleaner'/><category scheme='http://www.blogger.com/atom/ns#' term='preview'/><title type='text'>Preview of the UI in DataCleaner 2</title><content type='html'>Lately I've been blogging a lot about AnalyzerBeans, which is the name of the new engine of DataCleaner from version 2.0 and onwards. As AnalyzerBeans is nearing a state where it is usable and maturing I have now also taken the first steps of development on the roadmap for DataCleaner 2. As a techie I would like to attribute as much emphasis on the technical capabilities of AnalyzerBeans as possible but honestly it doesn't do much good without a good user interface also. So just as AnalyzerBeans was/is an attempt to rewrite the functional/logical part of DataCleaner, the new UI will be an attempt to deliver a user experience that feels new, exciting, more responsive and interactive. The "sketches" for the new UI are being drawn these days - I'll take you through a few examples.&lt;br /&gt;&lt;br /&gt;In the two screenshots below you can see the source data selection and a transformation of this source data. The source selection is pretty similar to the existing DataCleaner UI but notice the new transformation-oriented features. In the example below I want to use a "Name standardizer" transformation which will turn my "real_name" column into four (virtual) columns: First name, Last name, Middle name and Titulation. 
Similarly I can convert data types, concatenate, tokenize, parse etc.&lt;br /&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://2.bp.blogspot.com/_UpvxZrigQfQ/TMcjBK7i9WI/AAAAAAAAAFg/YYXp355Hv2M/s1600/Screenshot-0.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="204" src="http://2.bp.blogspot.com/_UpvxZrigQfQ/TMcjBK7i9WI/AAAAAAAAAFg/YYXp355Hv2M/s320/Screenshot-0.png" width="320" /&gt;&lt;/a&gt;&lt;a href="http://1.bp.blogspot.com/_UpvxZrigQfQ/TMcjKjbQvhI/AAAAAAAAAFk/Kfg5R-JMsZg/s1600/Screenshot-1.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="239" src="http://1.bp.blogspot.com/_UpvxZrigQfQ/TMcjKjbQvhI/AAAAAAAAAFk/Kfg5R-JMsZg/s320/Screenshot-1.png" width="320" /&gt;&lt;/a&gt;&lt;/div&gt;Another thing that is much needed in the existing DataCleaner UI is more elaborate configuration options for the various profiles. In the screenshot below you'll see the new and improved version of the &lt;b&gt;Pattern finder&lt;/b&gt; which includes a new set of configuration options. Notice that both my physical columns (real_name) and my virtual columns (as mentioned before) are available for the Pattern finder.&lt;br /&gt;&lt;div class="separator" style="clear: both; text-align: center;"&gt;&lt;a href="http://3.bp.blogspot.com/_UpvxZrigQfQ/TMcjQi9z8ZI/AAAAAAAAAFo/fZLxtMTztFo/s1600/Screenshot-2.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"&gt;&lt;img border="0" height="241" src="http://3.bp.blogspot.com/_UpvxZrigQfQ/TMcjQi9z8ZI/AAAAAAAAAFo/fZLxtMTztFo/s320/Screenshot-2.png" width="320" /&gt;&lt;/a&gt;&lt;/div&gt;There are a lot of other exciting things going into the new DataCleaner version but I will save some news for later :) For now, I can only invite everyone to try it out. 
All you have to do is:&lt;br /&gt;&lt;blockquote&gt;&gt; mkdir datacleaner_dev&lt;br /&gt;&gt; cd datacleaner_dev&lt;br /&gt;&gt; svn co http://eobjects.org/svn/AnalyzerBeans/trunk AnalyzerBeans&lt;br /&gt;&gt; cd AnalyzerBeans&lt;br /&gt;&gt; mvn install&lt;br /&gt;&gt; cd ..&lt;br /&gt;&gt; svn co http://eobjects.org/svn/DataCleaner/trunk DataCleaner&lt;br /&gt;&gt; cd DataCleaner&lt;br /&gt;&gt; mvn install&lt;br /&gt;&gt; java -jar target/DataCleaner-2.0-SNAPSHOT.jar&lt;br /&gt;&lt;/blockquote&gt;Good luck and let us know what you think :-)&lt;br /&gt;&lt;br /&gt;PS: Maybe I should note that even though the new version &lt;i&gt;is&lt;/i&gt; usable there are still a lot of things &lt;i&gt;NOT&lt;/i&gt; working. If you're wondering if something odd is a bug or a feature that has simply not been implemented yet - don't hesitate to ask.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-5749096302643540225?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/5749096302643540225/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=5749096302643540225' title='2 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/5749096302643540225'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/5749096302643540225'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2010/10/preview-of-ui-in-datacleaner-2.html' title='Preview of the UI in DataCleaner 2'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' 
width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://2.bp.blogspot.com/_UpvxZrigQfQ/TMcjBK7i9WI/AAAAAAAAAFg/YYXp355Hv2M/s72-c/Screenshot-0.png' height='72' width='72'/><thr:total>2</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-3491807514588963205</id><published>2010-09-26T11:24:00.007+02:00</published><updated>2010-10-01T18:19:55.598+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='weekdays'/><category scheme='http://www.blogger.com/atom/ns#' term='java'/><category scheme='http://www.blogger.com/atom/ns#' term='analyzer'/><category scheme='http://www.blogger.com/atom/ns#' term='analyzerbeans'/><category scheme='http://www.blogger.com/atom/ns#' term='api'/><category scheme='http://www.blogger.com/atom/ns#' term='development'/><category scheme='http://www.blogger.com/atom/ns#' term='implement'/><title type='text'>Developing an analyzer using the AnalyzerBeans Java API</title><content type='html'>&lt;img border="0" style="border: none; float: right; margin-left: 10px;" src="http://4.bp.blogspot.com/_UpvxZrigQfQ/TJ8SXrcil_I/AAAAAAAAAFQ/XYEIxGQPbGs/s1600/weekday-analyzer.png" /&gt;Previously I've posted about &lt;a href="http://kasper.eobjects.org/2010/09/developing-value-transformer-using.html"&gt;developing a value transformer using the AnalyzerBeans Java API&lt;/a&gt;. Now it's time to see how to develop an analyzer, which is a component for consuming data and turning it into a result that is humanly readable and hopefully useful. The Javadocs for the Java API are located &lt;a href="http://eobjects.org/analyzerbeans/apidocs/"&gt;here&lt;/a&gt;. 
There are lots of different analyzers in AnalyzerBeans already, which could be interesting to have a look at when you decide that you want to develop your own:&lt;br /&gt;&lt;ul&gt;&lt;li&gt;For typical measures there are analyzers like the &lt;a href="http://eobjects.org/resources/view-doc.html?doc=/svn/AnalyzerBeans/trunk/src/main/java/org/eobjects/analyzer/beans/NumberAnalyzer.java"&gt;Number analyzer&lt;/a&gt; and &lt;a href="http://eobjects.org/resources/view-doc.html?doc=/svn/AnalyzerBeans/trunk/src/main/java/org/eobjects/analyzer/beans/StringAnalyzer.java"&gt;String analyzer&lt;/a&gt;. These analyzers calculate standardized measures for these data types.&lt;/li&gt;&lt;li&gt;There's the &lt;a href="http://eobjects.org/resources/view-doc.html?doc=/svn/AnalyzerBeans/trunk/src/main/java/org/eobjects/analyzer/beans/valuedist/ValueDistributionAnalyzer.java"&gt;Value distribution&lt;/a&gt; analyzer which is interesting because it uses a backing database (using the &lt;b&gt;@Provided&lt;/b&gt; annotation) for counting unique values if the values exceed the amount of free memory.&lt;/li&gt;&lt;li&gt;The &lt;a href="http://eobjects.org/resources/view-doc.html?doc=/svn/AnalyzerBeans/trunk/src/main/java/org/eobjects/analyzer/beans/DateGapAnalyzer.java"&gt;Date gap analyzer&lt;/a&gt; is also a good example because it has named input columns, used for building a timeline of from- and to-dates.&lt;/li&gt;&lt;li&gt;The &lt;a href="http://eobjects.org/resources/view-doc.html?doc=/svn/AnalyzerBeans/trunk/src/main/java/org/eobjects/analyzer/beans/stringpattern/PatternFinderAnalyzer.java"&gt;Pattern finder&lt;/a&gt; analyzer which you can read a lot more about in &lt;a href="http://kasper.eobjects.org/2010/09/pattern-finder-20-latest-feature-in.html"&gt;one of my previous blog posts&lt;/a&gt;.&lt;/li&gt;&lt;/ul&gt;So let's begin with a simple example. 
Say you want to build a very simple analyzer that consumes date or time based values and determines the value distribution based on day-of-week (ie. how is the distribution of values grouped on monday, tuesday, wednesday etc.). While this is a rather naive example of an analyzer, it will work well as just that - an example.&lt;br /&gt;We'll begin with the requirements for building an analyzer:&lt;br /&gt;&lt;ul&gt;&lt;li&gt;You need to define a class that implements either the &lt;a href="http://eobjects.org/analyzerbeans/apidocs/org/eobjects/analyzer/beans/RowProcessingAnalyzer.html"&gt;RowProcessingAnalyzer&lt;/a&gt;&lt;b&gt;&amp;lt;R&amp;gt;&lt;/b&gt; or &lt;a href="http://eobjects.org/analyzerbeans/apidocs/org/eobjects/analyzer/beans/ExploringAnalyzer.html"&gt;ExploringAnalyzer&lt;/a&gt;&lt;b&gt;&amp;lt;R&amp;gt;&lt;/b&gt; interface. The latter is an advanced (and less frequent) type of analyzer, so we'll stick with the first: A &lt;i&gt;row processing&lt;/i&gt; analyzer. &lt;b&gt;&amp;lt;R&amp;gt;&lt;/b&gt; defines the result type of the analyzer. We can reuse a built-in result-type or write our own.&lt;/li&gt;&lt;li&gt;The class needs to be annotated with the &lt;b&gt;@AnalyzerBean&lt;/b&gt; annotation. This annotation takes an argument: The display name of the analyzer.&lt;/li&gt;&lt;li&gt;You need to inject one or more &lt;a href="http://eobjects.org/analyzerbeans/apidocs/org/eobjects/analyzer/data/InputColumn.html"&gt;InputColumn&lt;/a&gt;&lt;b&gt;&amp;lt;E&amp;gt;&lt;/b&gt;'s using the &lt;b&gt;@Configured&lt;/b&gt; annotation in order to consume the incoming data. The &lt;b&gt;&amp;lt;E&amp;gt;&lt;/b&gt;&amp;nbsp;type-parameter defines the datatype of interest, which is also used to determine which kinds of data types the analyzer supports. 
In our case we'll use Date as the InputColumn type, because we want our analyzer to consume date values.&lt;/li&gt;&lt;/ul&gt;So here is our class when it has been created in accordance with the requirements above:&lt;br /&gt;&lt;pre class="prettyprint lang-java"&gt;@AnalyzerBean("Average date analyzer")&lt;br /&gt;public class AverageDateAnalyzer implements RowProcessingAnalyzer&amp;lt;CrosstabResult&amp;gt; {&lt;br /&gt;&lt;br /&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; @Configured&lt;br /&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; InputColumn&amp;lt;Date&amp;gt; dateColumn;&lt;br /&gt;&lt;br /&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; public void run(InputRow row, int distinctCount) { ... }&lt;br /&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; public CrosstabResult getResult() { ... }&lt;br /&gt;}&lt;br /&gt;&lt;/pre&gt;Notice that we're using the built-in result type &lt;i&gt;CrosstabResult&lt;/i&gt;, which represents a result consisting of a dimensional crosstab. We could have used other built-in result types or we could have created our own result-class - the only requirement is that it implements the &lt;a href="http://eobjects.org/analyzerbeans/apidocs/org/eobjects/analyzer/result/AnalyzerResult.html"&gt;AnalyzerResult&lt;/a&gt; interface.&lt;br /&gt;&lt;br /&gt;The rest of the Analyzer should be "plain old Java" but of course using the API's that are available in AnalyzerBeans. I've explained &lt;a href="http://kasper.eobjects.org/2010/09/developing-value-transformer-using.html"&gt;most of these things before&lt;/a&gt;, but I'll go through it again.&lt;br /&gt;&lt;br /&gt;So now to consider how to implement the concrete analyzer logic. We'll use a regular map to hold the distribution values. We'll map the weekday numbers to counts in this map. 
But we'll need to keep a count for each column that we're analyzing, so it's going to be a nested map:&lt;br /&gt;&lt;pre class="prettyprint lang-java"&gt;private Map&amp;lt;InputColumn&amp;lt;Date&amp;gt;, Map&amp;lt;Integer, Integer&amp;gt;&amp;gt; distributionMap;&lt;br /&gt;&lt;/pre&gt;To initialize the map we need to have the InputColumn's injected first, so the constructor won't do. Instead we can annotate a method with the &lt;b&gt;@Initialize&lt;/b&gt; annotation, which will make AnalyzerBeans invoke the method when the bean has been properly initialized.&lt;br /&gt;&lt;pre class="prettyprint lang-java"&gt;@Initialize&lt;br /&gt;public void init() {&lt;br /&gt;&amp;nbsp; distributionMap = new HashMap&amp;lt;InputColumn&amp;lt;Date&amp;gt;, Map&amp;lt;Integer, Integer&amp;gt;&amp;gt;();&lt;br /&gt;&amp;nbsp; for (InputColumn&amp;lt;Date&amp;gt; col : dateColumns) {&lt;br /&gt;&amp;nbsp; &amp;nbsp; Map&amp;lt;Integer, Integer&amp;gt; countMap = new HashMap&amp;lt;Integer, Integer&amp;gt;(7);&lt;br /&gt;&amp;nbsp; &amp;nbsp; for (int i = Calendar.SUNDAY; i &amp;lt;= Calendar.SATURDAY; i++) {&lt;br /&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; // put a count of 0 for each day of the week&lt;br /&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; countMap.put(i, 0);&lt;br /&gt;&amp;nbsp; &amp;nbsp; }&lt;br /&gt;&amp;nbsp; &amp;nbsp; distributionMap.put(col, countMap);&lt;br /&gt;&amp;nbsp; }&lt;br /&gt;}&lt;br /&gt;&lt;/pre&gt;Now that the map has been initialized we can proceed to implement the run(...) 
method: &lt;br /&gt;&lt;pre class="prettyprint lang-java"&gt;@Override&lt;br /&gt;public void run(InputRow row, int distinctCount) {&lt;br /&gt;&amp;nbsp; for (InputColumn&amp;lt;Date&amp;gt; col : dateColumns) {&lt;br /&gt;&amp;nbsp; &amp;nbsp; Date value = row.getValue(col);&lt;br /&gt;&amp;nbsp; &amp;nbsp; if (value != null) {&lt;br /&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; Calendar c = Calendar.getInstance();&lt;br /&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; c.setTime(value);&lt;br /&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; int dayOfWeek = c.get(Calendar.DAY_OF_WEEK);&lt;br /&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; Map&amp;lt;Integer, Integer&amp;gt; countMap = distributionMap.get(col);&lt;br /&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; int count = countMap.get(dayOfWeek);&lt;br /&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; count += distinctCount;&lt;br /&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; countMap.put(dayOfWeek, count);&lt;br /&gt;&amp;nbsp; &amp;nbsp; }&lt;br /&gt;&amp;nbsp; }&lt;br /&gt;}&lt;br /&gt;&lt;/pre&gt;This should be pretty much "Java as usual". The only thing that should be new to you if you're an experienced Java developer is the way you extract values from the InputRow using the InputColumns as qualifiers: &lt;br /&gt;&lt;pre class="prettyprint lang-java"&gt;Date value = row.getValue(col);&lt;/pre&gt;Notice that the &lt;b&gt;value&lt;/b&gt; variable has the Date type. The AnalyzerBeans API takes advantage of type-safety to a large extent. Since the injected InputColumn's are defined as Date-columns this means that we can safely assume that the values in the incoming row are also of the Date-type. Furthermore the Date-column will be used to verify the configuration of AnalyzerBeans jobs and to give early error messages to the user if he tries to configure this particular Analyzer with a non-Date column. Now on to creating the result. As stated earlier we will use the &lt;i&gt;CrosstabResult&lt;/i&gt; for this. 
The crosstab result is a pretty dynamic result type that can be used for a lot of purposes. Its metaphor is similar to DataCleaner's result matrices but with added features. Here's how we build our crosstab: &lt;br /&gt;&lt;pre class="prettyprint lang-java"&gt;@Override&lt;br /&gt;public CrosstabResult getResult() {&lt;br /&gt;&amp;nbsp; CrosstabDimension columnDimension = new CrosstabDimension("Column");&lt;br /&gt;&amp;nbsp; CrosstabDimension weekdayDimension = new CrosstabDimension("Weekday");&lt;br /&gt;&amp;nbsp; weekdayDimension.addCategory("Sunday").addCategory("Monday")&lt;br /&gt;&amp;nbsp; &amp;nbsp; .addCategory("Tuesday").addCategory("Wednesday").addCategory("Thursday")&lt;br /&gt;&amp;nbsp; &amp;nbsp; .addCategory("Friday").addCategory("Saturday");&lt;br /&gt;&lt;br /&gt;&amp;nbsp; Crosstab&amp;lt;Integer&amp;gt; crosstab = new Crosstab&amp;lt;Integer&amp;gt;(Integer.class, columnDimension, weekdayDimension);&lt;br /&gt;&amp;nbsp; for (InputColumn&amp;lt;Date&amp;gt; col : dateColumns) {&lt;br /&gt;&amp;nbsp; &amp;nbsp; columnDimension.addCategory(col.getName());&lt;br /&gt;&amp;nbsp; &amp;nbsp; CrosstabNavigator&amp;lt;Integer&amp;gt; nav = crosstab.where(columnDimension, col.getName());&lt;br /&gt;&amp;nbsp; &amp;nbsp; Map&amp;lt;Integer, Integer&amp;gt; countMap = distributionMap.get(col);&lt;br /&gt;&amp;nbsp; &amp;nbsp; nav.where(weekdayDimension, "Sunday").put(countMap.get(Calendar.SUNDAY));&lt;br /&gt;&amp;nbsp; &amp;nbsp; nav.where(weekdayDimension, "Monday").put(countMap.get(Calendar.MONDAY));&lt;br /&gt;&amp;nbsp; &amp;nbsp; nav.where(weekdayDimension, "Tuesday").put(countMap.get(Calendar.TUESDAY));&lt;br /&gt;&amp;nbsp; &amp;nbsp; nav.where(weekdayDimension, "Wednesday").put(countMap.get(Calendar.WEDNESDAY));&lt;br /&gt;&amp;nbsp; &amp;nbsp; nav.where(weekdayDimension, "Thursday").put(countMap.get(Calendar.THURSDAY));&lt;br /&gt;&amp;nbsp; &amp;nbsp; nav.where(weekdayDimension, "Friday").put(countMap.get(Calendar.FRIDAY));&lt;br /&gt;&amp;nbsp; &amp;nbsp; 
nav.where(weekdayDimension, "Saturday").put(countMap.get(Calendar.SATURDAY));&lt;br /&gt;&amp;nbsp; }&lt;br /&gt;&amp;nbsp; return new CrosstabResult(getClass(), crosstab);&lt;br /&gt;}&lt;br /&gt;&lt;/pre&gt;Now we're done. You can take a look at the final result &lt;a href="http://eobjects.org/resources/view-doc.html?doc=/svn//AnalyzerBeans/trunk/src/main/java/org/eobjects/analyzer/beans/valuedist/WeekdayDistributionAnalyzer.java"&gt;here&lt;/a&gt;. When I run this analyzer with a small sample of data in three columns the result looks like this: &lt;br /&gt;&lt;pre&gt;             Order date Shipment date Delivery date&lt;br /&gt;Sunday                0             0             0&lt;br /&gt;Monday                2             0             1&lt;br /&gt;Tuesday               0             2             1&lt;br /&gt;Wednesday             0             0             0&lt;br /&gt;Thursday              1             0             0&lt;br /&gt;Friday                1             1             2&lt;br /&gt;Saturday              0             1             0&lt;br /&gt;&lt;/pre&gt;You can also check out the unit test for this analyzer &lt;a href="http://eobjects.org/resources/view-doc.html?doc=/svn//AnalyzerBeans/trunk/src/test/java/org/eobjects/analyzer/beans/valuedist/WeekdayDistributionAnalyzerTest.java"&gt;here&lt;/a&gt;.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-3491807514588963205?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/3491807514588963205/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=3491807514588963205' title='0 Comments'/><link rel='edit' type='application/atom+xml' 
href='http://www.blogger.com/feeds/1301643147176033927/posts/default/3491807514588963205'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/3491807514588963205'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2010/09/developing-analyzer-using-analyzerbeans.html' title='Developing an analyzer using the AnalyzerBeans Java API'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://4.bp.blogspot.com/_UpvxZrigQfQ/TJ8SXrcil_I/AAAAAAAAAFQ/XYEIxGQPbGs/s72-c/weekday-analyzer.png' height='72' width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-2731541832617236521</id><published>2010-09-25T19:03:00.008+02:00</published><updated>2010-09-26T10:58:01.556+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='look and feel'/><category scheme='http://www.blogger.com/atom/ns#' term='repository'/><category scheme='http://www.blogger.com/atom/ns#' term='svn'/><category scheme='http://www.blogger.com/atom/ns#' term='xslt'/><category scheme='http://www.blogger.com/atom/ns#' term='css'/><category scheme='http://www.blogger.com/atom/ns#' term='browser'/><title type='text'>Improved browsing of eobjects.org Subversion repository</title><content type='html'>10 minutes ago I stumbled upon &lt;a href="http://reposstyle.com/"&gt;Repos style&lt;/a&gt;, an XSLT-based plugin for improving the visual browsing experience of Subversion repositories. It took literally 5 minutes to install and works great. 
Thank you Repos style :) Take a look at the beautiful result:&lt;br /&gt;&lt;div style="text-align: center;"&gt;&lt;img border="0" style="margin-left: auto; margin-right: auto;" src="http://1.bp.blogspot.com/_UpvxZrigQfQ/TJ5LFUosDCI/AAAAAAAAAFM/0k6jlsUya40/s1600/eobjects-svn-repos-style.png" /&gt;&lt;/div&gt;Besides beautifying the browsing experience it also adds a valuable history view. If you click the "view history" links you'll be able to track the history of individual folders and files.&lt;br /&gt;&lt;br /&gt;See it yourself at &lt;a href="http://eobjects.org/svn/"&gt;http://eobjects.org/svn/&lt;/a&gt;...&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-2731541832617236521?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/2731541832617236521/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=2731541832617236521' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/2731541832617236521'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/2731541832617236521'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2010/09/improved-browsing-of-eobjectsorg.html' title='Improved browsing of eobjects.org Subversion repository'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' 
url='http://1.bp.blogspot.com/_UpvxZrigQfQ/TJ5LFUosDCI/AAAAAAAAAFM/0k6jlsUya40/s72-c/eobjects-svn-repos-style.png' height='72' width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-1780521207727277419</id><published>2010-09-13T16:23:00.013+02:00</published><updated>2010-09-25T16:22:57.219+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='java'/><category scheme='http://www.blogger.com/atom/ns#' term='analyzerbeans'/><category scheme='http://www.blogger.com/atom/ns#' term='recognition'/><category scheme='http://www.blogger.com/atom/ns#' term='identify'/><category scheme='http://www.blogger.com/atom/ns#' term='finder'/><category scheme='http://www.blogger.com/atom/ns#' term='matching'/><category scheme='http://www.blogger.com/atom/ns#' term='datacleaner'/><category scheme='http://www.blogger.com/atom/ns#' term='csv'/><category scheme='http://www.blogger.com/atom/ns#' term='expression'/><category scheme='http://www.blogger.com/atom/ns#' term='regular'/><category scheme='http://www.blogger.com/atom/ns#' term='pattern'/><category scheme='http://www.blogger.com/atom/ns#' term='string'/><category scheme='http://www.blogger.com/atom/ns#' term='database'/><title type='text'>Pattern finder 2.0 - the latest feature in AnalyzerBeans</title><content type='html'>&lt;img border="0" style="float: right; border: none;" src="http://1.bp.blogspot.com/_UpvxZrigQfQ/TI42C8bBtXI/AAAAAAAAAFA/kf1_ncKknPQ/s320/pattern-finder-icon.png" /&gt;&lt;br /&gt;&lt;br /&gt;I'm happy to be able to present a feature in this blog post that I know a lot of you have been asking for: A new and improved "Pattern finder" (as known in DataCleaner). Currently it lives within AnalyzerBeans (which only has a command line interface) but it will be rather easy to convert it to the "old" DataCleaner as well.&lt;br /&gt;&lt;br /&gt;The new Pattern finder works similarly to the old one. 
The new thing is that it supports a wide variety of configuration options (and it has been designed so that it will be significantly easier to add more options, if needed). Here are the current available options:&lt;br /&gt;&lt;br /&gt;&lt;table class="pretty-table" border="0" cellspacing="0" cellpadding="0"&gt;&lt;tbody&gt;&lt;tr&gt; &lt;th width="24%"&gt;Property&lt;/th&gt;&lt;th width="16%"&gt;Default value&lt;/th&gt;&lt;th width="60%"&gt;Description&lt;/th&gt; &lt;/tr&gt;&lt;tr&gt; &lt;td&gt;Discriminate text case&lt;/td&gt; &lt;td&gt;true&lt;/td&gt; &lt;td&gt;Sets whether or not text tokens that are upper case and lower case should be treated as different types of tokens.&lt;/td&gt; &lt;/tr&gt;&lt;tr&gt; &lt;td&gt;Discriminate negative numbers&lt;/td&gt; &lt;td&gt;false&lt;/td&gt; &lt;td&gt;Sets whether or not negative numbers should be treated as different token types than positive numbers.&lt;/td&gt; &lt;/tr&gt;&lt;tr&gt; &lt;td&gt;Discriminate decimals&lt;/td&gt; &lt;td&gt;true&lt;/td&gt; &lt;td&gt;Sets whether or not decimal numbers should be treated as different token types than integers.&lt;/td&gt; &lt;/tr&gt;&lt;tr&gt; &lt;td&gt;Enable mixed tokens&lt;/td&gt; &lt;td&gt;true&lt;/td&gt; &lt;td&gt;Enables the "mixed" token type (denoted as '?' output). This type of token will occur when numbers and letters occur without separation of whitespaces.&lt;/td&gt; &lt;/tr&gt;&lt;tr&gt; &lt;td&gt;Ignore repeated spaces&lt;/td&gt; &lt;td&gt;false&lt;/td&gt; &lt;td&gt;Sets whether or not repeated whitespaces should be ignored (ie. 
matched with single whitespaces)&lt;/td&gt; &lt;/tr&gt;&lt;tr&gt; &lt;td&gt;Decimal separator&lt;/td&gt; &lt;td&gt;,&lt;span class="Apple-style-span" style="color: blue;"&gt;*&lt;/span&gt;&lt;/td&gt; &lt;td&gt;The separator used to identify decimal numbers.&lt;/td&gt; &lt;/tr&gt;&lt;tr&gt; &lt;td&gt;Thousands separator&lt;/td&gt; &lt;td&gt;.&lt;span class="Apple-style-span" style="color: blue;"&gt;*&lt;/span&gt;&lt;/td&gt; &lt;td&gt;The character used as a thousands separator in large numbers.&lt;/td&gt; &lt;/tr&gt;&lt;tr&gt; &lt;td&gt;Minus sign&lt;/td&gt; &lt;td&gt;-&lt;span class="Apple-style-span" style="color: blue;"&gt;*&lt;/span&gt;&lt;/td&gt; &lt;td&gt;The character used to denote negative numbers.&lt;/td&gt; &lt;/tr&gt;&lt;tr&gt; &lt;td&gt;Predefined token name&lt;/td&gt; &lt;td&gt;(none)&lt;/td&gt; &lt;td&gt;Can be used to define an anticipated "predefined token" that should be replaced before any subsequent pattern recognition. Requires that the "Predefined token regexes" property is also set. An example of a name could be "Titulation"&lt;/td&gt; &lt;/tr&gt;&lt;tr&gt; &lt;td&gt;Predefined token regexes&lt;/td&gt; &lt;td&gt;(none)&lt;/td&gt; &lt;td&gt;Defines a set of regular expressions for the "predefined token". Requires that the "Predefined token name" property is also set. 
An example value for these regular expressions could be "[Mr,Mrs,Miss,Mister]" (which would correspond to the "Titulation" name).&lt;/td&gt; &lt;/tr&gt;&lt;/tbody&gt;&lt;/table&gt;&lt;br /&gt;&lt;span class="Apple-style-span" style="color: blue;"&gt;*&lt;/span&gt; = Depending on locale, the shown value is the typical one.&lt;br /&gt;&lt;br /&gt;This may all seem complicated, but rest assured that the default values are reasonable and almost exactly resembles what you would expect from the Pattern finder in DataCleaner (except for the "Discriminate text case" property, which is inherently turned off in DataCleaner).&lt;br /&gt;&lt;br /&gt;Here's how it works with a set of different inputs (job title, email, name) and &lt;a href="http://eobjects.org/svn/AnalyzerBeans/trunk/examples/patternfinder_job.xml"&gt;configurations&lt;/a&gt;:&lt;br /&gt;&lt;pre&gt;&amp;gt; java -jar target/AnalyzerBeans.jar -conf &lt;a href="http://eobjects.org/svn/AnalyzerBeans/trunk/examples/conf.xml"&gt;examples/conf.xml&lt;/a&gt; -job &lt;a href="http://eobjects.org/svn/AnalyzerBeans/trunk/examples/patternfinder_job.xml"&gt;examples/patternfinder_job.xml &lt;/a&gt;&lt;br /&gt;&lt;br /&gt;RESULT:&lt;br /&gt;                            Match count Sample      &lt;br /&gt;Aaaaa Aaa                            17 Sales Rep   &lt;br /&gt;AA Aaaaaaaaa                          2 VP Sales    &lt;br /&gt;Aaaaa Aaaaaaa (AAAA)                  2 Sale Manager (EMEA) &lt;br /&gt;Aaaaa Aaaaaaa (AAAAA, AAAA)           1 Sales Manager (JAPAN, APAC) &lt;br /&gt;Aaaaaaaaa                             1 President   &lt;br /&gt;&lt;br /&gt;&lt;br /&gt;RESULT:&lt;br /&gt;                                  Match count Sample      &lt;br /&gt;aaaaaa.aaa@aaaaaaa[Domain suffix]           4 foo.bar@company.com &lt;br /&gt;aaaaaaa@aaaaaaaa[Domain suffix]             3 santa@claus.com &lt;br /&gt;&lt;br /&gt;&lt;br /&gt;RESULT:&lt;br /&gt;                 Match count Sample      &lt;br /&gt;aaaaaaa aaaaa              4 
Jane Doe    &lt;br /&gt;aaaaaaaa, aaaaaa           2 Bar, Foo    &lt;br /&gt;aaa. aaaaaa aaa            1 Mrs. Foobar Foo&lt;br /&gt;&lt;/pre&gt;Notice that in the email example the two patterns end with "[Domain suffix]". This is because I've registered a corresponding "Predefined token" for this:&lt;br /&gt;&lt;pre class="prettyprint lang-xml"&gt;&amp;lt;analyzer&amp;gt;&lt;br /&gt;&amp;nbsp; &amp;lt;descriptor ref="Pattern finder" /&amp;gt;&lt;br /&gt;&amp;nbsp; &amp;lt;properties&amp;gt;&lt;br /&gt;&amp;nbsp; &amp;nbsp; &amp;lt;property name="Predefined token name" value="Domain suffix" /&amp;gt;&lt;br /&gt;&amp;nbsp; &amp;nbsp; &amp;lt;property name="Predefined token regexes" value="[\.com,\.org]" /&amp;gt;&lt;br /&gt;&amp;nbsp; &amp;lt;/properties&amp;gt;&lt;br /&gt;&amp;nbsp; &amp;lt;input ref="col_email" /&amp;gt;&lt;br /&gt;&amp;lt;/analyzer&amp;gt;&lt;/pre&gt;So now that you've seen the new Pattern finder... Does it meet all your expectations? Let me know if you've got any ideas or unresolved issues!&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-1780521207727277419?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/1780521207727277419/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=1780521207727277419' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/1780521207727277419'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/1780521207727277419'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2010/09/pattern-finder-20-latest-feature-in.html' title='Pattern finder 2.0 - the latest feature in 
AnalyzerBeans'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://1.bp.blogspot.com/_UpvxZrigQfQ/TI42C8bBtXI/AAAAAAAAAFA/kf1_ncKknPQ/s72-c/pattern-finder-icon.png' height='72' width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-6223114906328105807</id><published>2010-09-12T11:40:00.006+02:00</published><updated>2010-09-12T12:09:41.171+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='promotion'/><category scheme='http://www.blogger.com/atom/ns#' term='discussion'/><category scheme='http://www.blogger.com/atom/ns#' term='features'/><category scheme='http://www.blogger.com/atom/ns#' term='development'/><category scheme='http://www.blogger.com/atom/ns#' term='community'/><category scheme='http://www.blogger.com/atom/ns#' term='linkedin'/><category scheme='http://www.blogger.com/atom/ns#' term='group'/><category scheme='http://www.blogger.com/atom/ns#' term='member'/><category scheme='http://www.blogger.com/atom/ns#' term='datacleaner'/><title type='text'>Join the DataCleaner group at LinkedIn</title><content type='html'>&lt;div style="width: 400px; float: left; margin-right: 20px;"&gt;I've opened up a new &lt;a href="http://www.linkedin.com/groups?gid=3352784"&gt;LinkedIn group for DataCleaner&lt;/a&gt; and I would like to invite anyone with an interest in DataCleaner and open source data quality to join.&lt;br /&gt;&lt;br /&gt;If you've read my blog lately you will know that we are currently in heavy development of a new engine for the application (called "AnalyzerBeans") and the group's focus right now is also on this development. 
But it is also for sharing experience, discussing features, issues and solutions.&lt;br /&gt;&lt;br /&gt;&lt;a href="http://www.linkedin.com/groups?gid=3352784"&gt;Join the group&lt;/a&gt; to help us gather a bit of traction for the project.&lt;/div&gt;&lt;a href="http://www.linkedin.com/groups?gid=3352784"&gt;&lt;img border="0" src="http://1.bp.blogspot.com/_UpvxZrigQfQ/TIycU48PQVI/AAAAAAAAAE4/RfeBK0anIK4/s200/linkedin-button.png" style="float: left; border:none" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;div style="clear:both"&gt;&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-6223114906328105807?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/6223114906328105807/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=6223114906328105807' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/6223114906328105807'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/6223114906328105807'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2010/09/join-datacleaner-group-at-linkedin.html' title='Join the DataCleaner group at LinkedIn'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://1.bp.blogspot.com/_UpvxZrigQfQ/TIycU48PQVI/AAAAAAAAAE4/RfeBK0anIK4/s72-c/linkedin-button.png' height='72' 
width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-1389466294310639460</id><published>2010-09-11T22:32:00.001+02:00</published><updated>2010-09-13T15:27:05.809+02:00</updated><title type='text'>More instructions for authoring AnalyzerBeans jobs</title><content type='html'>I've previously posted a blog entry about how you could now &lt;a href="http://kasper.eobjects.org/2010/08/now-you-can-run-analyzerbeans-from.html"&gt;download and run a simple example of AnalyzerBeans&lt;/a&gt; in the shell. I've updated the example and improved the command-line interface so that it will further assist you if you are interested in using the tool.&lt;br /&gt;&lt;br /&gt;First of all, the command-line tool now has a reasonable usage screen:&lt;br /&gt;&lt;blockquote&gt;&amp;gt; java -jar target/AnalyzerBeans.jar&lt;br /&gt;-conf (-configuration, --configuration-file) FILE&lt;br /&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; : XML file describing the configuration of AnalyzerBeans&lt;br /&gt;-ds (-datastore, --datastore-name) VAL&lt;br /&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; : Name of datastore when printing a list of schemas, tables or columns&lt;br /&gt;-job (--job-file) FILE&lt;br /&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; : An analysis job XML file to execute&lt;br /&gt;-list [ANALYZERS | TRANSFORMERS | DATASTORES | SCHEMAS | TABLES | COLUMNS ]&lt;br /&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; : Used to print a list of various elements available in the configuration&lt;br /&gt;-s (-schema, --schema-name) VAL&lt;br /&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; : Name of schema when printing a list of tables or columns&lt;br /&gt;-t (-table, --table-name) VAL&lt;br /&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; : Name of table when printing a list of columns&lt;/blockquote&gt;&lt;br /&gt;As you can see, you can now for example list all available analyzers (there are a lot, so I'm only posting the relevant parts for my up-coming example here, the rest have been 
replaced with "..."):&lt;br /&gt;&lt;blockquote&gt;&amp;gt; java -jar target/AnalyzerBeans.jar -conf examples/conf.xml -list ANALYZERS&lt;br /&gt;Analyzers:&lt;br /&gt;----------&lt;br /&gt;...&lt;br /&gt;name: String analyzer&lt;br /&gt;- Consumes multiple input columns&lt;br /&gt;...&lt;br /&gt;name: Value distribution&lt;br /&gt;- Consumes a single input column&lt;br /&gt;- Property: name=Record unique values, type=boolean, required=true&lt;br /&gt;- Property: name=Bottom n most frequent values, type=Integer, required=false&lt;br /&gt;- Property: name=Top n most frequent values, type=Integer, required=false&lt;br /&gt;...&lt;br /&gt;name: Number analyzer&lt;br /&gt;- Consumes multiple input columns&lt;/blockquote&gt;&lt;br /&gt;I'll help you read this output: There are three analyzers listed. The String analyzer and Number analyzer both consume multiple columns, which means that they can be configured to have multiple inputs. Value distribution is another analyzer which only consumes a single column and has three configurable properties: Record unique values, Bottom n most frequent values and Top n most frequent values.&lt;br /&gt;&lt;br /&gt;You can similarly list available transformers:&lt;br /&gt;&lt;blockquote&gt;&amp;gt; java -jar target/AnalyzerBeans.jar -conf examples/conf.xml -list TRANSFORMERS&lt;br /&gt;...&lt;/blockquote&gt;or datastores:&lt;br /&gt;&lt;blockquote&gt;&amp;gt; java -jar target/AnalyzerBeans.jar -conf examples/conf.xml -list DATASTORES&lt;br /&gt;...&lt;/blockquote&gt;or tables and columns in a particular datastore&lt;br /&gt;&lt;blockquote&gt;&amp;gt; java -jar target/AnalyzerBeans.jar -conf examples/conf.xml -ds employees_csv -list TABLES&lt;br /&gt;...&lt;br /&gt;&amp;gt; java -jar target/AnalyzerBeans.jar -conf examples/conf.xml -ds employees_csv -t employees -list COLUMNS&lt;br /&gt;...&lt;br /&gt;&lt;/blockquote&gt;So now you have all the details that enable you to author an XML-based AnalyzerBeans job yourself. 
Let's take a look at the example. I'm going to post a few snippets from the &lt;a href="http://eobjects.org/svn/AnalyzerBeans/trunk/examples/employees_job.xml"&gt;employees_job.xml&lt;/a&gt; file which I also used in my previous post. Notice that this file has been updated since my last post so you will need to run an "&lt;span style="font-family: monospace; color: green;"&gt;&lt;b&gt;svn update&lt;/b&gt;&lt;/span&gt;" if you followed my previous tutorial, in order to get up-to-date code and data.&lt;br /&gt;&lt;br /&gt;The file starts up with a little metadata. We're not going into detail with that. Then there's the &amp;lt;source&amp;gt; part:&lt;br /&gt;&lt;blockquote&gt;&amp;lt;source&amp;gt;&lt;br /&gt;&amp;nbsp; &amp;nbsp; &amp;lt;data-context ref="&lt;span style="color: green;"&gt;employees_csv&lt;/span&gt;" /&amp;gt;&lt;br /&gt;&amp;nbsp; &amp;nbsp; &amp;lt;columns&amp;gt;&lt;br /&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;lt;column id="&lt;span style="color: red;"&gt;col_name&lt;/span&gt;" path="&lt;span style="color: green;"&gt;employees.csv.employees.name&lt;/span&gt;" /&amp;gt;&lt;br /&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;lt;column id="&lt;span style="color: red;"&gt;col_email&lt;/span&gt;" path="&lt;span style="color: green;"&gt;employees.csv.employees.email&lt;/span&gt;" /&amp;gt;&lt;br /&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;lt;column id="&lt;span style="color: red;"&gt;col_birthdate&lt;/span&gt;" path="&lt;span style="color: green;"&gt;employees.birthdate&lt;/span&gt;" /&amp;gt;&lt;br /&gt;&amp;nbsp; &amp;nbsp; &amp;lt;/columns&amp;gt;&lt;br /&gt;&amp;lt;/source&amp;gt;&lt;br /&gt;&lt;/blockquote&gt;The content is almost self-explanatory. There's a reference to the &lt;span style="color: green;"&gt;employees_csv&lt;/span&gt; datastore and the three columns defined in the CSV file: &lt;span style="color: green"&gt;name, email, birthdate&lt;/span&gt;. 
Notice the id's (marked in &lt;span style="color: red;"&gt;red&lt;/span&gt;) of these three columns. These id's will be referenced further down in the XML file.&lt;br /&gt;&lt;br /&gt;The next major part of the XML file is the transformation part. Let's have a look at one of the transformations:&lt;br /&gt;&lt;blockquote&gt;&amp;lt;transformer&amp;gt;&lt;br /&gt;&amp;nbsp; &amp;lt;descriptor ref="&lt;span style="color: blue;"&gt;Email standardizer&lt;/span&gt;" /&amp;gt;&lt;br /&gt;&amp;nbsp; &amp;lt;input ref="&lt;span style="color: red;"&gt;col_email&lt;/span&gt;" /&amp;gt;&lt;br /&gt;&amp;nbsp; &amp;lt;output id="&lt;span style="color: red;"&gt;col_username&lt;/span&gt;" name="Email username" /&amp;gt;&lt;br /&gt;&amp;nbsp; &amp;lt;output id="&lt;span style="color: red;"&gt;col_domain&lt;/span&gt;" name="Email domain" /&amp;gt;&lt;br /&gt;&amp;lt;/transformer&amp;gt;&lt;/blockquote&gt;This snippet defines that the &lt;span style="color: blue;"&gt;Email standardizer&lt;/span&gt; transformer consumes a single column (&lt;span style="color: red;"&gt;col_email&lt;/span&gt;) and generates two new virtual columns: &lt;span style="color: red;"&gt;col_username&lt;/span&gt; and &lt;span style="color: red;"&gt;col_domain&lt;/span&gt;. Now understanding the final part of the XML file will be pretty obvious. 
Let's have a look at one of the analyzers defined in the &amp;lt;analysis&amp;gt; part:&lt;br /&gt;&lt;blockquote&gt;&amp;lt;analyzer&amp;gt;&lt;br /&gt;&amp;nbsp; &amp;lt;descriptor ref="&lt;span style="color: blue"&gt;Value distribution&lt;/span&gt;" /&amp;gt;&lt;br /&gt;&amp;nbsp; &amp;lt;input ref="&lt;span style="color: red"&gt;col_username&lt;/span&gt;" /&amp;gt;&lt;br /&gt;&amp;lt;/analyzer&amp;gt;&lt;br /&gt;&lt;/blockquote&gt;It simply maps the (virtual) &lt;span style="color: red"&gt;col_username&lt;/span&gt; column to a &lt;span style="color: blue"&gt;Value distribution&lt;/span&gt; analyzer which is then executed (along with all the other analyzers defined in the file) when you run the job from the command line:&lt;br /&gt;&lt;blockquote&gt;&amp;gt; java -jar target/AnalyzerBeans.jar -conf examples/conf.xml -job examples/employees_job.xml&lt;br /&gt;...&lt;br /&gt;Value distribution for column: Email username&lt;br /&gt;Null count: 0&lt;br /&gt;Unique values: &lt;br /&gt;- asbjorn&lt;br /&gt;- foo.bar&lt;br /&gt;- foobar.foo&lt;br /&gt;- jane.doe&lt;br /&gt;- john.doe&lt;br /&gt;- kasper&lt;br /&gt;- santa&lt;br /&gt;...&lt;/blockquote&gt;I hope that you find this XML format pretty straight forward to author. 
Of course we will be implementing a graphical user interface as well, but for the moment I am actually quite satisfied with this early user interface.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-1389466294310639460?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/1389466294310639460/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=1389466294310639460' title='4 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/1389466294310639460'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/1389466294310639460'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2010/09/more-instructions-for-authoring.html' title='More instructions for authoring AnalyzerBeans jobs'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>4</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-1742218048692558063</id><published>2010-09-07T13:05:00.020+02:00</published><updated>2010-09-25T16:13:16.515+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='data'/><category scheme='http://www.blogger.com/atom/ns#' term='transformer'/><category scheme='http://www.blogger.com/atom/ns#' term='java'/><category scheme='http://www.blogger.com/atom/ns#' term='analyzerbeans'/><category scheme='http://www.blogger.com/atom/ns#' term='value'/><category 
scheme='http://www.blogger.com/atom/ns#' term='api'/><category scheme='http://www.blogger.com/atom/ns#' term='generated'/><category scheme='http://www.blogger.com/atom/ns#' term='date'/><category scheme='http://www.blogger.com/atom/ns#' term='age'/><category scheme='http://www.blogger.com/atom/ns#' term='tokenize'/><category scheme='http://www.blogger.com/atom/ns#' term='flow'/><category scheme='http://www.blogger.com/atom/ns#' term='convert'/><title type='text'>Developing a value transformer using the AnalyzerBeans Java API</title><content type='html'>&lt;p&gt;In this blog-entry I will demonstrate the &lt;a href="http://eobjects.org/analyzerbeans/apidocs/"&gt;Java API of AnalyzerBeans&lt;/a&gt; to create transformers, ie. components for transforming/converting/tokenizing/generating new values based on the existing values of a dataset. You &lt;i&gt;will&lt;/i&gt; need Java programming skills to follow this tutorial.&lt;/p&gt;&lt;p&gt;I find that the easiest way to explain this process is by running an example. So here's my case: I want to transform birthdates of persons (represented as Date fields) into age fields (represented as a number field). A scenario is depicted below:&lt;/p&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;" src="http://4.bp.blogspot.com/_UpvxZrigQfQ/TIYc0y50IuI/AAAAAAAAAEk/tN4S9rDmc50/s400/DateToAgeTransformer.jpg" border="0" /&gt;&lt;br /&gt;&lt;p&gt;After the transformation I will be able to independently process the age field, eg. with a number analysis, value distribution or apply some business rule that depends on age.&lt;/p&gt;&lt;p&gt;The requirements for building a transformer class are the following:&lt;/p&gt;&lt;ul&gt;&lt;li&gt;The class must implement the &lt;b&gt;Transformer&amp;lt;E&amp;gt;&lt;/b&gt; interface, where &amp;lt;E&amp;gt; is the data type of the generated fields. 
In our case we will therefore implement &lt;span style="color: green"&gt;Transformer&amp;lt;Integer&amp;gt;&lt;/span&gt;.&lt;/li&gt;&lt;li&gt;The class must be annotated with the &lt;b&gt;@TransformerBean&lt;/b&gt; annotation. The annotation takes an argument: The readable name of the transformer. We will thusly annotate: &lt;span style="color: green"&gt;@TransformerBean("Date to age")&lt;/span&gt;&lt;/li&gt;&lt;li&gt;In order to read from the incoming fields we need to inject an &lt;b&gt;InputColumn&amp;lt;E&amp;gt;&lt;/b&gt; instance (or alternatively an array of these), where &amp;lt;E&amp;gt; is the data type of the incoming fields. To inject we use the &lt;b&gt;@Configured&lt;/b&gt; annotation. In our example this translates to: &lt;span style="color: green"&gt;@Configured InputColumn&amp;lt;Date&amp;gt; dateColumn;&lt;/span&gt;&lt;/li&gt;&lt;/ul&gt;&lt;p&gt;After these steps our code will look something like this:&lt;/p&gt;&lt;pre class="prettyprint lang-java"&gt;&lt;b&gt;@TransformerBean("Date to age")&lt;/b&gt;&lt;br /&gt;public class DateToAgeTransformer implements &lt;b&gt;Transformer&amp;lt;&lt;span style="color: red"&gt;Integer&lt;/span&gt;&amp;gt;&lt;/b&gt; {&lt;br /&gt;&lt;br /&gt;&amp;nbsp; &lt;b&gt;@Configured&lt;/b&gt;&lt;br /&gt;&amp;nbsp; InputColumn&amp;lt;Date&amp;gt; dateColumn;&lt;br /&gt;&lt;br /&gt;&amp;nbsp; &lt;i&gt;@Override&lt;/i&gt;&lt;br /&gt;&amp;nbsp; public OutputColumns getOutputColumns() {&lt;br /&gt;&amp;nbsp; &amp;nbsp; // TODO&lt;br /&gt;&amp;nbsp; &amp;nbsp; return null;&lt;br /&gt;&amp;nbsp; }&lt;br /&gt;&lt;br /&gt;&amp;nbsp; &lt;i&gt;@Override&lt;/i&gt;&lt;br /&gt;&amp;nbsp; public &lt;span style="color: red"&gt;Integer&lt;/span&gt;[] transform(InputRow inputRow) {&lt;br /&gt;&amp;nbsp; &amp;nbsp; // TODO&lt;br /&gt;&amp;nbsp; &amp;nbsp; return null;&lt;br /&gt;&amp;nbsp; }&lt;br /&gt;}&lt;/pre&gt;&lt;p&gt;As we see, there are two methods defined by the Transformer&amp;lt;Integer&amp;gt; interface, that we need to implement. 
They are:&lt;/p&gt;&lt;ul&gt;&lt;li&gt;&lt;b&gt;getOutputColumns():&lt;/b&gt; This method is called by the framework to determine which virtual columns will be produced by the transformer. In our case it is quite simple: The transformer creates virtual columns for age (both in days and in years, just to make it more flexible). The method body should therefore just be:&lt;pre class="prettyprint lang-java"&gt;return new OutputColumns("Age in days", "Age in years");&lt;/pre&gt;&lt;/li&gt;&lt;li&gt;&lt;b&gt;transform(InputRow):&lt;/b&gt; This method will be called for each row with values to be transformed. The return type of the method is an Integer-array because we chose to implement Transformer&amp;lt;Integer&amp;gt;. The indexes of the returned array should match the output columns, ie. index 0 is for ”Age in days” and index 1 is for ”Age in years”. Let's have a look at the methods implementation:&lt;pre class="prettyprint lang-java"&gt;Integer[] result = new Integer[2];&lt;br /&gt;Date date = inputRow.getValue(dateColumn);&lt;br /&gt;&lt;br /&gt;if (date != null) {&lt;br /&gt;&amp;nbsp; long diffMillis = today.getTime() - date.getTime();&lt;br /&gt;&amp;nbsp; int diffDays = (int) (diffMillis / (1000 * 60 * 60 * 24));&lt;br /&gt;&lt;br /&gt;&amp;nbsp; result[0] = diffDays;&lt;br /&gt;&lt;br /&gt;&amp;nbsp; // use Joda time to easily calculate the diff in years&lt;br /&gt;&amp;nbsp; int diffYears = Years.yearsBetween(new DateTime(date), new DateTime(today)).getYears();&lt;br /&gt;&amp;nbsp;  result[1] = diffYears;&lt;br /&gt;}&lt;br /&gt;&lt;br /&gt;return result;&lt;/pre&gt;&lt;/li&gt;&lt;/ul&gt;&lt;p&gt;Of course I didn't do all the work of writing this tutorial without checking in the code so you could try it in action. 
The code for the ”Date to age” transformer is available &lt;a href="http://eobjects.org/svn/AnalyzerBeans/trunk/src/main/java/org/eobjects/analyzer/beans/DateToAgeTransformer.java"&gt;here&lt;/a&gt; and there's also a unittest available &lt;a href="http://eobjects.org/svn/AnalyzerBeans/trunk/src/test/java/org/eobjects/analyzer/beans/DateToAgeTransformerTest.java"&gt;here&lt;/a&gt;, that is pretty usable as a demonstration of how to unittest transformers. I hope some of you engage in developing transformers and let me know how it turns out. In my next blog post I'll explain how to build Analyzers which are the obvious next step when developing components for AnalyzerBeans.&lt;/p&gt;&lt;p&gt;There are a few other good examples of transformers that might be of interest:&lt;/p&gt;&lt;ul&gt;&lt;li&gt;The &lt;a href="http://eobjects.org/svn/AnalyzerBeans/trunk/src/main/java/org/eobjects/analyzer/beans/ConvertToDateTransformer.java"&gt;Convert to date&lt;/a&gt; transformer which will try to convert any value to a date. This is perhaps useful in combination with the transformer that I've just explained in this tutorial. In other words: These two transformers may need to be chained if for example the birth date to be transformed is stored in a String-based field.&lt;/li&gt;&lt;li&gt;The &lt;a href="http://eobjects.org/svn/AnalyzerBeans/trunk/src/main/java/org/eobjects/analyzer/beans/TokenizerTransformer.java"&gt;Tokenizer&lt;/a&gt; transformer because it has a flexible amount of output columns based on the users configuration. 
Notice the &lt;span style="color: green"&gt;@Configured Integer numTokens&lt;/span&gt; that is used in the getOutputColumns() for this purpose.&lt;/li&gt;&lt;li&gt;The &lt;a href="http://eobjects.org/svn/AnalyzerBeans/trunk/src/main/java/org/eobjects/analyzer/beans/EmailStandardizerTransformer.java"&gt;Email standardizer&lt;/a&gt; transformer, because it makes use of the NamedPattern object, which I have previously introduced in my blog entry "&lt;a href="http://kasper.eobjects.dk/2010/08/nice-abstraction-over-regular.html"&gt;A nice abstraction over regular expressions&lt;/a&gt;".&lt;/li&gt;&lt;/ul&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-1742218048692558063?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/1742218048692558063/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=1742218048692558063' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/1742218048692558063'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/1742218048692558063'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2010/09/developing-value-transformer-using.html' title='Developing a value transformer using the AnalyzerBeans Java API'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' 
url='http://4.bp.blogspot.com/_UpvxZrigQfQ/TIYc0y50IuI/AAAAAAAAAEk/tN4S9rDmc50/s72-c/DateToAgeTransformer.jpg' height='72' width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-1380841442477610563</id><published>2010-08-29T12:26:00.012+02:00</published><updated>2010-09-13T15:27:26.413+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='transformer'/><category scheme='http://www.blogger.com/atom/ns#' term='example'/><category scheme='http://www.blogger.com/atom/ns#' term='analyzerbeans'/><category scheme='http://www.blogger.com/atom/ns#' term='analysis'/><category scheme='http://www.blogger.com/atom/ns#' term='cli'/><category scheme='http://www.blogger.com/atom/ns#' term='command line'/><category scheme='http://www.blogger.com/atom/ns#' term='datacleaner'/><title type='text'>Now you can run AnalyzerBeans (from the shell)</title><content type='html'>&lt;p&gt;Lately I've been blabbering a lot about the marvels of &lt;a href="http://eobjects.org/trac/wiki/AnalyzerBeans"&gt;AnalyzerBeans&lt;/a&gt; - the project that is aimed at re-implementing an engine for data analysis based on my experience from &lt;a href="http://datacleaner.eobjects.org"&gt;DataCleaner&lt;/a&gt;.&lt;/p&gt;&lt;p&gt;An important milestone in any development project, especially those like AnalyzerBeans, that are implemented bottom-up, is when it is actually possible to &lt;i&gt;use&lt;/i&gt; the application without having any developer skills. So far the development of AnalyzerBeans has been focused on making it work in a unittesting perspective, but now we've reached a point where it is also possible to invoke the engine from the command line.&lt;/p&gt;&lt;p&gt;Since we haven't released AnalyzerBeans yet, you will still have to check out the code and build it yourself. 
It's rather easy - it just requires &lt;a href="http://subversion.tigris.org/"&gt;Subversion&lt;/a&gt; and &lt;a href="http://maven.apache.org/"&gt;Maven&lt;/a&gt;. First, check out the code:&lt;/p&gt;&lt;blockquote&gt;&amp;gt; svn co http://eobjects.org/svn/AnalyzerBeans/trunk AnalyzerBeans&lt;/blockquote&gt;&lt;p&gt;Now build it:&lt;/p&gt;&lt;blockquote&gt;&amp;gt; cd AnalyzerBeans&lt;br /&gt;&amp;gt; mvn install&lt;/blockquote&gt;&lt;p&gt;And now run the example job that's in there:&lt;/p&gt;&lt;blockquote&gt;&amp;gt; java -jar target/AnalyzerBeans.jar \&lt;br /&gt;&amp;gt; -configuration examples/conf.xml -job examples/employees_job.xml&lt;/blockquote&gt;&lt;p&gt;The job will transform/standardize the "full name" and "email address" columns of a CSV-file (located in the &lt;b&gt;examples&lt;/b&gt;-folder) and then print out value distribution and string analysis results for the standardized tokens: First name, Last name, Email username, Email domain.&lt;/p&gt;&lt;p&gt;If you've gone this far, you've probably also tried opening the xml-files &lt;b&gt;employees_job.xml&lt;/b&gt; and &lt;b&gt;conf.xml&lt;/b&gt; in the &lt;b&gt;examples&lt;/b&gt;-folder. Maybe you've even figured out that the conf.xml describes the application setup and that the employees_job.xml file describes the job contents. You can edit these files as you please to further explore the application. I will be sure to update my blog soon with some more examples. Also one of the next features of the command line interface will be to print the available Analyzers and Transformers in order to make it easier to author the xml job-files.&lt;/p&gt;&lt;p&gt;If you're just trying this out now and if you are getting excited about AnalyzerBeans, here are my previous blog posts on the subject. 
Please don't hesitate to let me know what you think.&lt;/p&gt;&lt;ul&gt;&lt;li&gt;2010-08: &lt;a href="http://kasper.eobjects.dk/2010/08/nice-abstraction-over-regular.html"&gt;A nice abstraction over Regular Expressions&lt;/a&gt;&lt;/li&gt;&lt;li&gt;2010-08: &lt;a href="http://kasper.eobjects.dk/2010/08/visualizations-and-api-documentation.html"&gt;Visualizations and API Documentation for AnalyzerBeans&lt;/a&gt;&lt;/li&gt;&lt;li&gt;2010-07: &lt;a href="http://kasper.eobjects.dk/2010/07/data-transformation-added-to.html"&gt;Data transformation added to AnalyzerBeans&lt;/a&gt;&lt;/li&gt;&lt;li&gt;2009-06: &lt;a href="http://kasper.eobjects.dk/2009/06/introducing-analyzerbeans.html"&gt;Introducing AnalyzerBeans&lt;/a&gt;&lt;/li&gt;&lt;/ul&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-1380841442477610563?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/1380841442477610563/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=1380841442477610563' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/1380841442477610563'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/1380841442477610563'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2010/08/now-you-can-run-analyzerbeans-from.html' title='Now you can run AnalyzerBeans (from the shell)'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' 
src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-6907117016619197578</id><published>2010-08-27T18:11:00.009+02:00</published><updated>2010-09-25T16:23:54.083+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='java'/><category scheme='http://www.blogger.com/atom/ns#' term='analyzerbeans'/><category scheme='http://www.blogger.com/atom/ns#' term='expression'/><category scheme='http://www.blogger.com/atom/ns#' term='regular'/><category scheme='http://www.blogger.com/atom/ns#' term='tokens'/><category scheme='http://www.blogger.com/atom/ns#' term='pattern'/><category scheme='http://www.blogger.com/atom/ns#' term='regex'/><category scheme='http://www.blogger.com/atom/ns#' term='matching'/><category scheme='http://www.blogger.com/atom/ns#' term='expressions'/><title type='text'>A nice abstraction over regular expressions</title><content type='html'>&lt;p&gt;Often when you're developing data profiling, matching or cleansing software, you're dealing with expression matching, typically through regular expressions (regexes). One thing that I find is that it is often a tedious and error-prone task to define and reuse regexes or parts of regexes. In &lt;a href="http://eobjects.org/trac/wiki/AnalyzerBeans"&gt;AnalyzerBeans&lt;/a&gt; there's a huge need for easier and reusable pattern matching. To counter this requirement I've come up with a helper-class, Named Pattern, which you can use to match and identify tokens in the patterns in a type-safe and easy way. 
Here's a short example for matching and tokenizing names based on two simple patterns:&lt;/p&gt;&lt;blockquote class="prettyprint lang-java"&gt;//First define an enum with the tokens in the pattern(s)&lt;br /&gt;public enum NamePart { FIRSTNAME, LASTNAME, TITULATION }&lt;br /&gt;&lt;br /&gt;// The two patterns&lt;br /&gt;NamedPattern&amp;lt;NamePart&amp;gt; &lt;b style="color: red;"&gt;p1&lt;/b&gt; = new NamedPattern("TITULATION. FIRSTNAME LASTNAME", NamePart.class);&lt;br /&gt;NamedPattern&amp;lt;NamePart&amp;gt; &lt;b style="color: red;"&gt;p2&lt;/b&gt; = new NamedPattern("FIRSTNAME LASTNAME", NamePart.class);&lt;br /&gt;NamedPattern&amp;lt;NamePart&amp;gt; &lt;b style="color: green;"&gt;p3&lt;/b&gt; = new NamedPattern("LASTNAME, FIRSTNAME", NamePart.class);&lt;br /&gt;&lt;br /&gt;// notice the type parameter &amp;lt;NamePart&amp;gt; - the match result type is typesafe!&lt;br /&gt;NamedPatternMatch&amp;lt;NamePart&amp;gt; &lt;b&gt;match&lt;/b&gt; = &lt;b style="color: red;"&gt;p1&lt;/b&gt;.match("Sørensen, Kasper");&lt;br /&gt;assert &lt;b&gt;match&lt;/b&gt; == null;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;match&lt;/b&gt; = &lt;b style="color: red;"&gt;p2&lt;/b&gt;.match("Sørensen, Kasper");&lt;br /&gt;assert &lt;b&gt;match&lt;/b&gt; == null;&lt;br /&gt;&lt;br /&gt;// here's a match!&lt;br /&gt;&lt;b&gt;match&lt;/b&gt; = &lt;b style="color: green;"&gt;p3&lt;/b&gt;.match("Sørensen, Kasper");&lt;br /&gt;assert &lt;b&gt;match&lt;/b&gt; != null;&lt;br /&gt;&lt;br /&gt;String firstName = &lt;b&gt;match&lt;/b&gt;.get(NamePart.FIRSTNAME);&lt;br /&gt;String lastName = &lt;b&gt;match&lt;/b&gt;.get(NamePart.LASTNAME);&lt;br /&gt;String titulation = &lt;b&gt;match&lt;/b&gt;.get(NamePart.TITULATION);&lt;/blockquote&gt;&lt;p&gt;All in all I think that the NamedPattern class (and the NamedPatternMatch) in combination with your own enums is a pretty elegant way to do string pattern matching. 
There's also a way to specify how the underlying regular expression will be built by letting the enum implement the HasGroupLiteral interface.&lt;/p&gt;&lt;p&gt;Developers can dive into the details of these classes and interfaces at the &lt;a href="http://eobjects.org/analyzerbeans/apidocs/"&gt;Javadoc / API Documentation&lt;/a&gt; for AnalyzerBeans (package &lt;i&gt;org.eobjects.analyzer.util&lt;/i&gt;).&lt;/p&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-6907117016619197578?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/6907117016619197578/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=6907117016619197578' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/6907117016619197578'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/6907117016619197578'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2010/08/nice-abstraction-over-regular.html' title='A nice abstraction over regular expressions'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-93080728588825162</id><published>2010-08-09T18:30:00.009+02:00</published><updated>2010-08-09T19:14:35.843+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='data'/><category 
scheme='http://www.blogger.com/atom/ns#' term='transformer'/><category scheme='http://www.blogger.com/atom/ns#' term='analyzerbeans'/><category scheme='http://www.blogger.com/atom/ns#' term='analysis'/><category scheme='http://www.blogger.com/atom/ns#' term='api'/><category scheme='http://www.blogger.com/atom/ns#' term='images'/><category scheme='http://www.blogger.com/atom/ns#' term='visualization'/><category scheme='http://www.blogger.com/atom/ns#' term='documentation'/><category scheme='http://www.blogger.com/atom/ns#' term='metamodel'/><category scheme='http://www.blogger.com/atom/ns#' term='flow'/><title type='text'>Visualizations and API documentation for AnalyzerBeans</title><content type='html'>&lt;p&gt;I've spent a few hours trying to capture some of the basic principles of data flow and execution in my new favourite spare-time project &lt;a href="http://eobjects.org/trac/wiki/AnalyzerBeans"&gt;AnalyzerBeans&lt;/a&gt;. Here are the results, which you will also find available in the &lt;a href="http://eobjects.org/analyzerbeans/apidocs/"&gt;API Documentation&lt;/a&gt;.&lt;/p&gt;&lt;p&gt;The first image shows the relationship between analyzers, transformers and the data that they consume:&lt;/p&gt;&lt;img src="http://eobjects.org/analyzerbeans/apidocs/org/eobjects/analyzer/beans/doc-files/AnalyzerBeans-dataflow.png" alt="Data flow" width="70%" /&gt;&lt;p&gt;The second image shows a "close-up" of a row of data. 
Some of the values originate from the actual datastore, while some of the values may be virtual, generated by a chain of transformers:&lt;/p&gt;&lt;img src="http://eobjects.org/analyzerbeans/apidocs/org/eobjects/analyzer/data/doc-files/AnalyzerBeans-inputrow.png" alt="InputRow" width="70%" /&gt;&lt;p&gt;Enjoy :)&lt;/p&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-93080728588825162?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/93080728588825162/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=93080728588825162' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/93080728588825162'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/93080728588825162'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2010/08/visualizations-and-api-documentation.html' title='Visualizations and API documentation for AnalyzerBeans'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-5946994804810960226</id><published>2010-07-19T18:20:00.002+02:00</published><updated>2010-07-19T18:30:45.339+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='java'/><category scheme='http://www.blogger.com/atom/ns#' term='data quality'/><category 
scheme='http://www.blogger.com/atom/ns#' term='analyzerbeans'/><category scheme='http://www.blogger.com/atom/ns#' term='analysis'/><category scheme='http://www.blogger.com/atom/ns#' term='api'/><category scheme='http://www.blogger.com/atom/ns#' term='database'/><category scheme='http://www.blogger.com/atom/ns#' term='datacleaner'/><title type='text'>Data transformation added to AnalyzerBeans</title><content type='html'>I have been doing a lot of improvements to the API of &lt;a href="http://eobjects.org/trac/wiki/AnalyzerBeans"&gt;AnalyzerBeans&lt;/a&gt; - a sandbox project that I am very passionate about. In short it is a new Data Profiling/Analysis engine that I think will eventually replace the core parts of &lt;a href="http://datacleaner.eobjects.org"&gt;DataCleaner&lt;/a&gt;. So here's a bit about "what's cookin'":&lt;div&gt;&lt;ul&gt;&lt;li&gt;The largest of the new features is that it is now possible to transform data before it will be analyzed. The idea here is that it should be possible to tokenize/split/convert/etc. values before they enter the analysis. This means one fundamental change to analyzers, namely that they consume data through an intermediary input-column type which can be virtual (to represent eg. a token) or physical (to represent a "regular" column in a datastore). The new component type, "Transformer Beans" will support all the same cool stuff that &lt;a href="http://kasper.eobjects.dk/2009/06/introducing-analyzerbeans.html"&gt;I've already introduced to the analyzer components&lt;/a&gt; like dependency injection, persistent/scalable collections, annotation-driven composition and registration etc.&lt;/li&gt;&lt;li&gt;Another neat thing that I'm currently finishing up is an Analysis Job Builder. The idea is that analysis jobs should be immutable because this makes it a lot safer to parallelize the process of executing the jobs. 
Immutable structures are very good to work with when you are executing but they tend to be tedious when you're &lt;i&gt;building&lt;/i&gt; the structure. So I'm also adding an API for building the jobs which will emphasize type-safety and syntactic neatness to make it easy to programmatically manage and verify the jobs you're building. This will make it a lot easier to build a good UI for AnalyzerBeans.&lt;/li&gt;&lt;/ul&gt;&lt;/div&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-5946994804810960226?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/5946994804810960226/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=5946994804810960226' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/5946994804810960226'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/5946994804810960226'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2010/07/data-transformation-added-to.html' title='Data transformation added to AnalyzerBeans'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-4785448675500562171</id><published>2010-05-23T13:19:00.014+02:00</published><updated>2010-05-23T21:01:15.459+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='java'/><category 
scheme='http://www.blogger.com/atom/ns#' term='integration'/><category scheme='http://www.blogger.com/atom/ns#' term='api'/><category scheme='http://www.blogger.com/atom/ns#' term='interface'/><category scheme='http://www.blogger.com/atom/ns#' term='programming'/><category scheme='http://www.blogger.com/atom/ns#' term='datacleaner'/><title type='text'>Using DataCleaner's API to run jobs as a part of your Java applications</title><content type='html'>Yesterday someone asked me if there were any examples around of how to set up scheduled &lt;a href="http://datacleaner.eobjects.org/"&gt;DataCleaner&lt;/a&gt; jobs in a Java EE environment. While the common case has been to just use the Command-Line Interface (CLI) for DataCleaner together with cron-jobs or Windows' scheduled tasks, he had a point - for some organizations this kind of solution would be insufficient - invocation through code would be better if you already have a lot of Java applications running (e.g. in a Java EE environment).&lt;br /&gt;&lt;br /&gt;So here's my response to that request - I'll try to walk you through the process of invoking DataCleaner through its &lt;a href="http://eobjects.org/datacleaner/apidocs/current/"&gt;Java API&lt;/a&gt;. I'll start out with an example of a Profiling job - validation is quite similar but I'll cover that in another blog-post later. It's my ambition that these walkthroughs will eventually end up in the DataCleaner &lt;a href="http://datacleaner.eobjects.org/docs"&gt;online docs&lt;/a&gt;.&lt;br /&gt;&lt;br /&gt;The package &lt;a href="http://eobjects.org/datacleaner/apidocs/current/dk/eobjects/datacleaner/execution/package-summary.html"&gt;dk.eobjects.datacleaner.execution&lt;/a&gt; holds the main entrypoints for setting up and running a DataCleaner job. 
First you need to have a &lt;a href="http://eobjects.org/datacleaner/apidocs/current/dk/eobjects/datacleaner/execution/DataCleanerExecutor.html"&gt;DataCleanerExecutor&lt;/a&gt; - in this case we wanna execute profiling jobs so we'll use a factory-method for setting up our executor accordingly:&lt;br /&gt;&lt;blockquote&gt;DataCleanerExecutor&amp;lt;&lt;span style="font-size:-1;"&gt;&lt;code&gt;&lt;a href="http://eobjects.org/datacleaner/apidocs/current/dk/eobjects/datacleaner/profiler/ProfilerJobConfiguration.html" title="class in dk.eobjects.datacleaner.profiler"&gt;ProfilerJobConfiguration&lt;/a&gt;,&lt;a href="http://eobjects.org/datacleaner/apidocs/current/dk/eobjects/datacleaner/profiler/IProfileResult.html" title="interface in dk.eobjects.datacleaner.profiler"&gt;IProfileResult&lt;/a&gt;,&lt;a href="http://eobjects.org/datacleaner/apidocs/current/dk/eobjects/datacleaner/profiler/IProfile.html" title="interface in dk.eobjects.datacleaner.profiler"&gt;IProfile&lt;/a&gt;&lt;/code&gt;&lt;/span&gt;&amp;gt; &lt;span style="font-weight: bold;"&gt;executor&lt;/span&gt; = ProfilerExecutorCallback.createExecutor();&lt;/blockquote&gt;Notice the three type-parameters. They dictate that this executor handles &lt;a href="http://eobjects.org/datacleaner/apidocs/current/dk/eobjects/datacleaner/profiler/ProfilerJobConfiguration.html"&gt;ProfilerJobConfigurations&lt;/a&gt;, it produces &lt;a href="http://eobjects.org/datacleaner/apidocs/current/dk/eobjects/datacleaner/profiler/IProfileResult.html"&gt;IProfileResults&lt;/a&gt; and it executes using &lt;a href="http://eobjects.org/datacleaner/apidocs/current/dk/eobjects/datacleaner/profiler/IProfile.html"&gt;IProfile&lt;/a&gt;'s.&lt;br /&gt;&lt;br /&gt;Now it's time to create some profiling jobs. We do this by adding configuration-objects that describe the tasks at hand (the executor will handle the lifecycle of the actual profilers for us). 
In this example we'll configure a ValueDistributionProfile:&lt;br /&gt;&lt;blockquote&gt;&lt;span style="color: rgb(0, 153, 0);"&gt;// for this purpose we never use the "displayName" param for anything, so we just enter "valuedist" or whatever&lt;/span&gt;&lt;br /&gt;IProfileDescriptor &lt;span style="font-weight: bold;"&gt;descriptor&lt;/span&gt; = new BasicProfileDescriptor("valuedist", ValueDistributionProfile.class);&lt;br /&gt;&lt;br /&gt;ProfilerJobConfiguration &lt;span style="font-weight: bold;"&gt;jobConfiguration&lt;/span&gt; = new ProfilerJobConfiguration(&lt;span style="font-weight: bold;"&gt;descriptor&lt;/span&gt;);&lt;br /&gt;&lt;br /&gt;&lt;span style="color: rgb(0, 153, 0);"&gt;// all properties are by convention placed as constants with a PROPERTY_ prefix in their profile class&lt;/span&gt;&lt;br /&gt;&lt;span style="font-weight: bold;"&gt;jobConfiguration&lt;/span&gt;.addProfileProperty(ValueDistributionProfile.PROPERTY_TOP_N, "5");&lt;br /&gt;&lt;span style="font-weight: bold;"&gt;jobConfiguration&lt;/span&gt;.addProfileProperty(ValueDistributionProfile.PROPERTY_BOTTOM_N,  "5");&lt;br /&gt;&lt;/blockquote&gt;Also we need to select which columns to profile as a part of our job configuration. DataCleaner uses &lt;a href="http://eobjects.org/metamodel"&gt;MetaModel&lt;/a&gt; for its datastore connectivity so we need to retrieve our column definitions using a MetaModel DataContext. 
I'll exemplify with typical MySQL database connection values but there  are a lot of other options in the &lt;a href="http://eobjects.org/datacleaner/apidocs/current/dk/eobjects/datacleaner/data/DataContextSelection.html"&gt;DataContextSelection&lt;/a&gt;  class:&lt;blockquote&gt;DataContextSelection &lt;span style="font-weight: bold;"&gt;dcs&lt;/span&gt; = new DataContextSelection();&lt;br /&gt;&lt;span style="font-weight: bold;"&gt;dcs&lt;/span&gt;.selectDatabase("jdbc:mysql://localhost/mydb",  null, "username", "password", new TableType[] {TableType.VIEW,  TableType.TABLE});&lt;br /&gt;DataContext &lt;span style="font-weight: bold;"&gt;dc&lt;/span&gt; = dcs.getDataContext();&lt;br /&gt;Table[] &lt;span style="font-weight: bold;"&gt;tables&lt;/span&gt; = dc.getDefaultSchema().getTables();&lt;br /&gt;&lt;br /&gt;&lt;span style="color: rgb(0, 153, 0);"&gt;// I'll just add &lt;span style="font-style: italic;"&gt;all&lt;/span&gt; columns from &lt;span style="font-style: italic;"&gt;all&lt;/span&gt; tables!&lt;/span&gt;&lt;br /&gt;List&amp;lt;Column&amp;gt; &lt;span style="font-weight: bold;"&gt;allColumns&lt;/span&gt; = new LinkedList&amp;lt;Column&amp;gt;();&lt;br /&gt;for (Table &lt;span style="font-weight: bold;"&gt;table&lt;/span&gt; : &lt;span style="font-weight: bold;"&gt;tables&lt;/span&gt;) {&lt;br /&gt;&amp;nbsp; &amp;nbsp; &amp;nbsp; &lt;span style="font-weight: bold;"&gt;allColumns&lt;/span&gt;.addAll(Arrays.asList(&lt;span style="font-weight: bold;"&gt;table&lt;/span&gt;.getColumns()));&lt;br /&gt;}&lt;br /&gt;&lt;br /&gt;&lt;span style="font-weight: bold;"&gt;jobConfiguration&lt;/span&gt;.setColumns(&lt;span style="font-weight: bold;"&gt;allColumns&lt;/span&gt;);&lt;br /&gt;&lt;/blockquote&gt;Finally, we add our job configuration to the executor:&lt;br /&gt;&lt;blockquote&gt;&lt;span style="font-weight: bold;"&gt;executor&lt;/span&gt;.addJobConfiguration(&lt;span style="font-weight: bold;"&gt;jobConfiguration&lt;/span&gt;);&lt;/blockquote&gt;If we want to, 
we can add our own observers to receive notifications as the job progresses. For example, in the DataCleaner GUI we use an observer for updating the on-screen progress indicators.&lt;br /&gt;&lt;blockquote&gt;&lt;span style="font-weight: bold;"&gt;executor&lt;/span&gt;.addProgressObserver(...);&lt;/blockquote&gt;Another optional feature is to set the execution options through an &lt;a href="http://eobjects.org/datacleaner/apidocs/current/dk/eobjects/datacleaner/execution/ExecutionConfiguration.html"&gt;ExecutionConfiguration&lt;/a&gt; object. As an example we can configure our job to use multithreading by assigning more than one connection and/or by allowing more than one query to execute at a time (the example below has a max thread count of 2*5 = 10):&lt;br /&gt;&lt;blockquote&gt;ExecutionConfiguration &lt;span style="font-weight: bold;"&gt;conf&lt;/span&gt; = new ExecutionConfiguration();&lt;br /&gt;&lt;span style="font-weight: bold;"&gt;conf&lt;/span&gt;.setMaxQueriesPerConnection(2);&lt;br /&gt;&lt;span style="font-weight: bold;"&gt;conf&lt;/span&gt;.setMaxConnections(5);&lt;br /&gt;&lt;span style="font-weight: bold;"&gt;executor&lt;/span&gt;.setExecutionConfiguration(&lt;span style="font-weight: bold;"&gt;conf&lt;/span&gt;);&lt;/blockquote&gt;And now it's time to kick off the executor! When we do this we provide our DataContextSelection object, which holds the connection information needed to spawn connections to the datastore.&lt;br /&gt;&lt;blockquote&gt;&lt;span style="font-weight: bold;"&gt;executor&lt;/span&gt;.execute(&lt;span style="font-weight: bold;"&gt;dcs&lt;/span&gt;);&lt;/blockquote&gt;Alternatively you can start the execution asynchronously by calling:&lt;br /&gt;&lt;blockquote&gt;&lt;span style="font-weight: bold;"&gt;executor&lt;/span&gt;.execute(&lt;span style="font-weight: bold;"&gt;dcs&lt;/span&gt;, false);&lt;/blockquote&gt;And now ... you're done. All you have to do now is investigate the results. 
You retrieve these calling:&lt;br /&gt;&lt;blockquote&gt;List&amp;lt;IProfileResult&amp;gt; &lt;span style="font-weight: bold;"&gt;results&lt;/span&gt; = executor.getResults();&lt;/blockquote&gt;Consider using one of the &lt;a href="http://eobjects.org/datacleaner/apidocs/current/dk/eobjects/datacleaner/export/IResultExporter.html"&gt;result exporters&lt;/a&gt; in the DataCleaner API (providing support for CSV, XML and HTML export) or use some custom code to retrieve just the metrics of your interest by traversing the &lt;a href="http://eobjects.org/datacleaner/apidocs/current/dk/eobjects/datacleaner/profiler/IProfileResult.html"&gt;IProfileResult&lt;/a&gt; model.&lt;br /&gt;&lt;br /&gt;I hope this walkthrough has brought some light to the subject of invoking DataCleaner through its Java API. It's the first time I sit down and try to explain this part of the application so I might have missed some points but I think the major ideas are present. Let me know what you think - and suggestions for improving the API are always welcome.&lt;br /&gt;&lt;br /&gt;A couple of notes&lt;code&gt;&lt;/code&gt; to the use of DataCleaner's execution API:&lt;br /&gt;&lt;ul&gt;&lt;li&gt;Notice in the &lt;a href="http://eobjects.org/datacleaner/apidocs/current/"&gt;javadocs&lt;/a&gt; that almost all the classes covered in this blog-post have a serialize() and a static deserialize(...) method. These are used for saving and loading the configuration to/from XML documents. So if you've already created your jobs using DataCleaner's GUI then you can save these jobs (as .dcp or .dcv files) and restore them using deserialize(...). 
That might be an easier and quicker path to solving your problems if you're not very keen on setting up everything in code.&lt;br /&gt;&lt;/li&gt;&lt;li&gt;If you want a shortcut for setting up the ProfileDescriptors and ValidationRuleDescriptors, then take a look at DataCleaner's bundled XML files, &lt;span style="font-style: italic;"&gt;datacleaner-config.xml&lt;/span&gt;, &lt;span style="font-style: italic;"&gt;datacleaner-profiler-modules.xml&lt;/span&gt; and &lt;span style="font-style: italic;"&gt;datacleaner-validator-modules.xml&lt;/span&gt;. These are &lt;a href="http://www.springsource.org/"&gt;Spring Framework&lt;/a&gt; based files that are currently used by DataCleaner as a convenient way to serve these descriptors. You should be able to load the objects easily using Spring and then you'll have the descriptors set up automatically.&lt;br /&gt;&lt;/li&gt;&lt;/ul&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-4785448675500562171?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/4785448675500562171/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=4785448675500562171' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/4785448675500562171'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/4785448675500562171'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2010/05/using-datacleaners-api-to-run-jobs-as.html' title='Using DataCleaner&apos;s API to run jobs as a part of your Java applications'/><author><name>Kasper 
Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-913360962352359390</id><published>2010-05-15T21:46:00.010+02:00</published><updated>2010-05-19T22:01:48.177+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='java'/><category scheme='http://www.blogger.com/atom/ns#' term='hibernate'/><category scheme='http://www.blogger.com/atom/ns#' term='jpa'/><category scheme='http://www.blogger.com/atom/ns#' term='flushing'/><category scheme='http://www.blogger.com/atom/ns#' term='jms'/><category scheme='http://www.blogger.com/atom/ns#' term='transaction'/><category scheme='http://www.blogger.com/atom/ns#' term='seam'/><category scheme='http://www.blogger.com/atom/ns#' term='manual'/><category scheme='http://www.blogger.com/atom/ns#' term='flush'/><title type='text'>Watch out for manual flushing in JBoss Seam</title><content type='html'>I've done quite a lot of development in &lt;a href="http://www.seamframework.org/"&gt;JBoss Seam&lt;/a&gt; for the last six months and overall I'm quite enthusiastic. Also I'm looking forward to using some of the features of Seam in their new Java EE 6 incarnations (ie. 
in short: &lt;a href="http://java.sun.com/javaee/6/docs/api/javax/inject/Inject.html"&gt;@Inject&lt;/a&gt; in stead of &lt;a href="http://docs.jboss.org/seam/2.2.1.CR1/api/org/jboss/seam/annotations/In.html"&gt;@In&lt;/a&gt;, &lt;a href="http://java.sun.com/javaee/6/docs/api/javax/enterprise/inject/Produces.html"&gt;@Produces&lt;/a&gt; in stead of &lt;a href="http://docs.jboss.org/seam/2.2.1.CR1/api/org/jboss/seam/annotations/Factory.html"&gt;@Factory&lt;/a&gt;, &lt;a href="http://docs.jboss.org/seam/2.2.1.CR1/api/org/jboss/seam/annotations/Unwrap.html"&gt;@Unwrap&lt;/a&gt; and &lt;a href="http://docs.jboss.org/seam/2.2.1.CR1/api/org/jboss/seam/annotations/Out.html"&gt;@Out&lt;/a&gt;, and &lt;a href="http://java.sun.com/javaee/6/docs/api/javax/enterprise/context/ConversationScoped.html"&gt;@ConversationScoped&lt;/a&gt; in stead of &lt;a href="http://docs.jboss.org/seam/2.2.1.CR1/api/org/jboss/seam/annotations/Scope.html"&gt;@Scope&lt;/a&gt;(&lt;a href="http://docs.jboss.org/seam/2.2.1.CR1/api/org/jboss/seam/ScopeType.html#CONVERSATION"&gt;CONVERSATION&lt;/a&gt;) ;-)).&lt;br /&gt;&lt;br /&gt;One key feature of Seam is it's persistence strategy and at first glance it's quite a cool thing. The idea is to use an &lt;span style="font-weight: bold;"&gt;extended persistence context&lt;/span&gt; which means that your entities are kept managed across transactions. The extended persistence context is very important as Seam wraps each request in transactions and all changes to entities caused by actions in the request will then be automatically propagated to the database. The extended persistence context saves you from having to call merge(...) in order to reattach your entities all the time. Calling merge(...) 
is a heavy operation so this is good.&lt;br /&gt;&lt;br /&gt;This pattern makes a lot of sense just until the point where you want to make it possible to edit an entity throughout a few requests but then forget about your changes (because the user changes his/her mind). To make this use case possible the Seam guys are advocating to use "MANUAL flushing" which means that Hibernate won't flush updates to the database unless you programmatically tell it to. Seems smart - here's the idea: Hibernate will keep track of all changes made in transactions (requests) but won't flush them. At a certain time the user will typically hit a "Save changes" button and then everything will be flushed.&lt;br /&gt;&lt;br /&gt;Apart from the fact that MANUAL flushing is a Hibernate-specific feature not available with other JPA persistence providers, this pattern has three very serious flaws:&lt;br /&gt;&lt;ol&gt;&lt;li&gt;Any &lt;span style="font-weight: bold;"&gt;query&lt;/span&gt; fired will cause an implicit flush - even if the flush-mode is MANUAL. This means that if your conversation involves a query, your changes will be flushed even though you haven't invoked the flush-method yourself. Again - this almost certainly rules out the possibility to use MANUAL flushing in just about any conversation I can imagine (especially if you want to enable navigation by nested conversations). 
Queries are a good example of something that used to be a '&lt;a href="http://domaindrivendesign.org/node/127"&gt;side-effect-free function&lt;/a&gt;' but is now something  that can impose a lot of unintended changes in state.&lt;br /&gt;&lt;span style="font-weight: bold;"&gt;NOTE:&lt;/span&gt; I stand (a bit) corrected here - I was advised that this behaviour can be avoided by setting the flushmode of the query to COMMIT and it seems to work.&lt;br /&gt;&lt;/li&gt;&lt;li&gt;While we're at the topic - if you want to enable nested conversations then you will have to do a lot of plumbing code to make sure that the nested parts &lt;span style="font-weight: bold;"&gt;don't invoke&lt;/span&gt; the flush-method and then end up flushing on behalf of the parent-conversation also. It &lt;span style="font-style: italic;"&gt;IS&lt;/span&gt; possible to code your way around this flaw but it's a very serious impediment to composing your application of reusable nested conversations.&lt;/li&gt;&lt;li&gt;The Seam guys seem to have failed to realize that transactions are used for &lt;span style="font-weight: bold;"&gt;other purposes&lt;/span&gt; than saving entities. For example, if you're using JMS, you would send messages at commit-time which means that developers of the JMS-dispatch code will assume that if a commit takes place, data has been persisted. If the message contains for example id's of updated entities the messagehandler will access these entities &lt;span style="font-style: italic;"&gt;before &lt;/span&gt;any updates have taken place because the updates haven't been flushed!&lt;/li&gt;&lt;/ol&gt;I think that these flaws make it utterly hard to develop applications using MANUAL flushing because of the intrinsics it imposes on the flow in your application. 
In this light, I'm quite pleased that they didn't include manual flushing in Java EE 6 (or rather, JPA 2).&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-913360962352359390?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/913360962352359390/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=913360962352359390' title='5 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/913360962352359390'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/913360962352359390'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2010/05/watch-out-for-manual-flushing-in-jboss.html' title='Watch out for manual flushing in JBoss Seam'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>5</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-2628768548785862074</id><published>2010-01-30T13:51:00.010+01:00</published><updated>2010-01-30T14:21:14.323+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='multiple'/><category scheme='http://www.blogger.com/atom/ns#' term='query'/><category scheme='http://www.blogger.com/atom/ns#' term='datasources'/><category scheme='http://www.blogger.com/atom/ns#' term='metamodel'/><category scheme='http://www.blogger.com/atom/ns#' term='datastores'/><title type='text'>Query multiple datastores with 
MetaModel 1.2</title><content type='html'>I am currently packaging and distributing the new version of &lt;a href="http://eobjects.org/metamodel"&gt;MetaModel&lt;/a&gt; - version 1.2. In this blog-post I'll introduce what I think is the most exciting thing introduced in this version: Composite DataContexts aka. "Query multiple datasources with a single query". Or in plain English: You can now treat multiple datastores as if they were one.&lt;br /&gt;&lt;br /&gt;An example:&lt;br /&gt;Imagine that you want to match a database table with the contents of an excel spreadsheet, you can easily create a query that reads from both datastores and does all the joining, filtering etc. that is possible with regular MetaModel queries.&lt;br /&gt;&lt;blockquote&gt;DataContext &lt;span style="color: rgb(255, 0, 0);"&gt;database&lt;/span&gt; = DataContextFactory.createJdbcDataContext( myConnection );&lt;br /&gt;DataContext &lt;span style="color: rgb(204, 51, 204);"&gt;spreadsheet&lt;/span&gt; = DataContextFactory.createExcelDataContext(new File("my_spreadsheet.xls"));&lt;br /&gt;&lt;br /&gt;Table dbTable = database.getDefaultSchema().getTableByName("my_db_table");&lt;br /&gt;Column dbPkColumn = dbTable.getColumnByName("my_primary_key");&lt;br /&gt;Table excelTable = spreadsheet.getDefaultSchema().getTableByName("my_sheet");&lt;br /&gt;Column excelFkColumn = excelTable.getColumnByName("my_foreign_key");&lt;br /&gt;&lt;br /&gt;&lt;span style="color: rgb(51, 204, 0);"&gt;// now we create a composite DataContext which enables us&lt;br /&gt;// to explore and query both DataContexts transparently&lt;br /&gt;// through the same DataContext reference!&lt;/span&gt;&lt;br /&gt;DataContext &lt;span style="color: rgb(0, 102, 0);"&gt;composite&lt;/span&gt; = DataContextFactory.createCompositeDataContext( &lt;span style="color: rgb(255, 0, 0);"&gt;database&lt;/span&gt;, &lt;span style="color: rgb(153, 51, 153);"&gt;spreadsheet&lt;/span&gt; );&lt;br /&gt;&lt;br /&gt;&lt;span style="color: 
rgb(51, 204, 0);"&gt;// example query with cartesian product and cross-datastore where clause&lt;/span&gt;&lt;br /&gt;Query q = new Query();&lt;br /&gt;q.from(dbTable).from(excelTable);&lt;br /&gt;q.select(dbTable.getColumns());&lt;br /&gt;q.select(excelTable.getColumns());&lt;br /&gt;q.where(dbPkColumn, OperatorType.EQUALS_TO, excelFkColumn);&lt;br /&gt;&lt;br /&gt;DataSet ds = composite.executeQuery(q);&lt;br /&gt;&lt;span style="color: rgb(51, 204, 0);"&gt;// read the result&lt;/span&gt;&lt;br /&gt;&lt;/blockquote&gt;&lt;br /&gt;... How cool is that?&lt;br /&gt;&lt;br /&gt;Of course if a query is posted to the composite DataContext that spans multiple underlying DataContexts, it will most likely spawn a case of "client side joining" which will not perform well compared to co-locating the datastores. But often that is not possible (or practical if it's just a case of ad-hoc analysis) so I believe that the new composite DataContext feature can add some real value to a lot of projects!&lt;br /&gt;&lt;br /&gt;Other notable news in MetaModel 1.2: We now support MS Access databases and dBase (.dbf) database-files.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-2628768548785862074?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/2628768548785862074/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=2628768548785862074' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/2628768548785862074'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/2628768548785862074'/><link rel='alternate' type='text/html' 
href='http://kasper.eobjects.org/2010/01/query-multiple-datastores-with.html' title='Query multiple datastores with MetaModel 1.2'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-3534423077474092181</id><published>2009-11-30T18:00:00.032+01:00</published><updated>2009-11-30T20:03:03.186+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='mondrian'/><category scheme='http://www.blogger.com/atom/ns#' term='schema'/><category scheme='http://www.blogger.com/atom/ns#' term='mysql'/><category scheme='http://www.blogger.com/atom/ns#' term='cube'/><category scheme='http://www.blogger.com/atom/ns#' term='jndi'/><category scheme='http://www.blogger.com/atom/ns#' term='datasource'/><category scheme='http://www.blogger.com/atom/ns#' term='jdbc'/><category scheme='http://www.blogger.com/atom/ns#' term='jboss'/><title type='text'>Setting up Mondrian for JNDI DataSources, XML/A and custom CSS styles</title><content type='html'>The other day I decided that I wanted to set up mondrian as an adhoc analysis package for &lt;a href="http://www.lundogbendsen.dk/"&gt;Lund&amp;amp;Bendsen&lt;/a&gt;s intranet application, "Yacs". I didn't want to install a large application like Pentaho for just this specific need - rather I wanted to deploy just a simple cube schema, reuse the Java EE datasource definition that the intranet-application was already using and apply some basic styling to comply with the corporate profile. 
Doing these steps showed a lot more complex than I first imagined, primarily because I think the examples of the standalone mondrian distribution are overly complex and poorly designed, incapsulation-wise. Here is a list of steps I recommend doing to set up Mondrian "the right way":&lt;br /&gt;&lt;ol&gt;&lt;li&gt;Deploy your Java EE datasource in a container/database-specific way. If you JBoss and MySQL like I do, here's an example datasource descriptor, place it in the deploy-folder with a filename like "mydatasource.xml" (&lt;span style="font-weight: bold;"&gt;bold&lt;/span&gt; parts should be replaced with your specific configuration):&lt;br /&gt;&lt;blockquote&gt;&amp;lt;datasources&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;   &amp;lt;local-tx-datasource&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;       &amp;lt;jndi-name&amp;gt;&lt;span style="font-weight: bold;"&gt;MyDataSource&lt;/span&gt;&amp;lt;/jndi-name&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;       &amp;lt;connection-url&amp;gt;jdbc:mysql://localhost/&lt;span style="font-weight: bold;"&gt;mydatabase&lt;/span&gt;&amp;lt;/connection-url&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;       &amp;lt;driver-class&amp;gt;com.mysql.jdbc.Driver&amp;lt;/driver-class&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;       &amp;lt;user-name&amp;gt;&lt;span style="font-weight: bold;"&gt;username&lt;/span&gt;&amp;lt;/user-name&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;       &amp;lt;password&amp;gt;&lt;span style="font-weight: bold;"&gt;password&lt;/span&gt;&amp;lt;/password&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;       &amp;lt;min-pool-size&amp;gt;5&amp;lt;/min-pool-size&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;       &amp;lt;max-pool-size&amp;gt;20&amp;lt;/max-pool-size&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;       &amp;lt;valid-connection-checker-class-name&amp;gt;&lt;br 
/&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;           org.jboss.resource.adapter.jdbc.vendor.MySQLValidConnectionChecker&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;       &amp;lt;/valid-connection-checker-class-name&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;       &amp;lt;metadata&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;           &amp;lt;type-mapping&amp;gt;mySQL&amp;lt;/type-mapping&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;       &amp;lt;/metadata&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;   &amp;lt;/local-tx-datasource&amp;gt;&lt;br /&gt;&amp;lt;/datasources&amp;gt;&lt;/blockquote&gt;&lt;/li&gt;&lt;li&gt;Unzip the mondrian.war archive so you can edit the application.&lt;/li&gt;&lt;li&gt;Add a container specific mapping of the DataSource to this application. In JBoss this is done by placing a file called "jboss-web.xml" in the WEB-INF folder with this content:&lt;br /&gt;&lt;blockquote&gt;&amp;lt;jboss-web&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;   &amp;lt;resource-ref&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;       &amp;lt;res-ref-name&amp;gt;&lt;span style="font-weight: bold;"&gt;MyDataSource&lt;/span&gt;&amp;lt;/res-ref-name&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;       &amp;lt;res-type&amp;gt;javax.sql.DataSource&amp;lt;/res-type&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;       &amp;lt;jndi-name&amp;gt;java:/&lt;span style="font-weight: bold;"&gt;MyDataSource&lt;/span&gt;&amp;lt;/jndi-name&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;   &amp;lt;/resource-ref&amp;gt;&lt;br /&gt;&amp;lt;/jboss-web&amp;gt;&lt;/blockquote&gt;&lt;/li&gt;&lt;li&gt;Now edit the WEB-INF/web.xml file and add the following entry inside the &amp;lt;web-app&amp;gt; element:&lt;blockquote&gt;&amp;lt;resource-ref&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;   &amp;lt;res-ref-name&amp;gt;&lt;span style="font-weight: 
bold;"&gt;MyDataSource&lt;/span&gt;&amp;lt;/res-ref-name&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;   &amp;lt;res-type&amp;gt;javax.sql.DataSource&amp;lt;/res-type&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;   &amp;lt;res-auth&amp;gt;Container&amp;lt;/res-auth&amp;gt;&lt;br /&gt;&amp;lt;/resource-ref&amp;gt;&lt;/blockquote&gt;&amp;lt;/web-app&amp;gt;&lt;/li&gt;&lt;li&gt;Also change the mapping of the JPivot filter so it goes like this:&lt;br /&gt;&lt;blockquote&gt;&amp;lt;filter-mapping&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;   &amp;lt;filter-name&amp;gt;JPivotController&amp;lt;/filter-name&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;   &amp;lt;url-pattern&amp;gt;&lt;span style="font-weight: bold;"&gt;/*&lt;/span&gt;&amp;lt;/url-pattern&amp;gt;&lt;br /&gt;&amp;lt;/filter-mapping&amp;gt;&lt;/blockquote&gt;&lt;/li&gt;&lt;li&gt;Create a schema-file and save it under WEB-INF/mycatalog.xml. I won't give instructions as to writing schemas - &lt;a href="http://mondrian.pentaho.org/documentation/schema.php"&gt;Mondrians documentation cover this quite well&lt;/a&gt;.&lt;br /&gt;&lt;/li&gt;&lt;li&gt;If you want to enable XML/A support, use this as a template for your WEB-INF/&lt;span style="font-weight: bold;"&gt;datasources&lt;/span&gt;.xml file (notice here that we use the application-local JNDI string here (including java:comp/env/...)):&lt;br /&gt;&lt;blockquote&gt;&amp;lt;datasources&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;   &amp;lt;datasource&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;       &amp;lt;datasourcename&amp;gt;Provider=Mondrian;DataSource=&lt;span style="font-weight: bold;"&gt;MyDataSource&lt;/span&gt;;&amp;lt;/datasourcename&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;       &amp;lt;datasourcedescription&amp;gt;My example datasource&amp;lt;/datasourcedescription&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;       &amp;lt;url&amp;gt;http://localhost:8888/mondrian/xmla&amp;lt;/url&amp;gt;&lt;br 
/&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;       &amp;lt;datasourceinfo&amp;gt;Provider=mondrian;DataSource=java:comp/env/&lt;span style="font-weight: bold;"&gt;MyDataSource&lt;/span&gt;;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;       &amp;lt;/datasourceinfo&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;       &amp;lt;providername&amp;gt;Mondrian&amp;lt;/providername&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;       &amp;lt;providertype&amp;gt;MDP&amp;lt;/providertype&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;       &amp;lt;authenticationmode&amp;gt;Unauthenticated&amp;lt;/authenticationmode&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;       &amp;lt;catalogs&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;           &amp;lt;catalog name="&lt;span style="font-weight: bold"&gt;MyCatalog&lt;/span&gt;"&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;               &amp;lt;definition&amp;gt;/WEB-INF/&lt;span style="font-weight: bold;"&gt;mycatalog&lt;/span&gt;.xml&amp;lt;/definition&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;           &amp;lt;/catalog&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;       &amp;lt;/catalogs&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;   &amp;lt;/datasource&amp;gt;&lt;br /&gt;&amp;lt;/datasources&amp;gt;&lt;/blockquote&gt;&lt;/li&gt;&lt;li&gt;The views on the cube that you now want must be created as individual JSP pages. One of the things that are really lacking in the mondrian bundle is reasonable JSP pages with less complexity and reasonable reuse of datasources. Here's how I build mine (you can more or less put this stuff into the testpage.jsp page and then you're not dependent on all the jsp include stuff) - notice now that the datasource-reference entered here is just the last, ie. 
the reference in web.xml:&lt;br /&gt;&lt;blockquote&gt;&amp;lt;%     if (session.getAttribute("query01") == null) { %&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;lt;jp:mondrianquery id="query01" datasource="&lt;b&gt;MyDataSource&lt;/b&gt;" cataloguri="&lt;b&gt;/WEB-INF/mydatasource.xml&lt;/b&gt;"&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;  &amp;lt;!-- &lt;i&gt;Initial MDX query goes here&lt;/i&gt; --&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;lt;/jp:mondrianquery&amp;gt;&lt;br /&gt;&amp;lt;%     } %&amp;gt;&lt;/blockquote&gt;&lt;/li&gt;&lt;/ol&gt;Hooray! Now you've got connection pooling, sharing and all the other cool stuff that Java EE DataSources provide. Next step: Add styling. First: Remove all the stylesheets that come with mondrian. You don't need them because it's actually quite a lot easier to add your own than to try and modify the existing ones. Here's the result of 10 minutes of styling:&lt;br /&gt;&lt;br /&gt;&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://2.bp.blogspot.com/_UpvxZrigQfQ/SxQE3kmtedI/AAAAAAAAAEQ/gd7K00_zflc/s1600/pivot-screenshot.png"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 400px; height: 205px;" src="http://2.bp.blogspot.com/_UpvxZrigQfQ/SxQE3kmtedI/AAAAAAAAAEQ/gd7K00_zflc/s400/pivot-screenshot.png" border="0" alt="" id="BLOGGER_PHOTO_ID_5409954405144164818" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;The important CSS id's and classes are:&lt;br /&gt;&lt;ul&gt;&lt;br /&gt;&lt;li&gt;To control styling of the pivot table and its cells it's important that you add &amp;lt;div&amp;gt; around this tag in the JSP:&lt;blockquote&gt;&amp;lt;wcf:render ref="query01" xslUri="/WEB-INF/jpivot/table/mdxtable.xsl" xslCache="true" /&amp;gt;&lt;/blockquote&gt;You can then use your div's id-attribute to target cells, headers etc. 
in your stylesheet.&lt;br /&gt;&lt;/li&gt;&lt;br /&gt;&lt;li&gt;The class &lt;b&gt;.heading-heading&lt;/b&gt;: Used for headings of headings, ie. the top-level blue cells in the screenshot above.&lt;/li&gt;&lt;br /&gt;&lt;li&gt;The classes &lt;b&gt;.column-heading-span, .column-heading-even, .column-heading-odd&lt;/b&gt;: Used for column headers, ie. the gray cells above the pivot table content.&lt;/li&gt;&lt;br /&gt;&lt;li&gt;The classes &lt;b&gt;.column-heading-span, .column-heading-even, .column-heading-odd&lt;/b&gt;: Used for column headers, ie. the gray cells above the pivot table content.&lt;/li&gt;&lt;br /&gt;&lt;li&gt;The classes &lt;b&gt;.row-heading-span, .row-heading-even, .row-heading-odd&lt;/b&gt;: Used for row headers, ie. the gray cells to the left of the pivot table content.&lt;/li&gt;&lt;br /&gt;&lt;li&gt;The classes &lt;b&gt;.cell-even&lt;/b&gt; and &lt;b&gt;.cell-odd&lt;/b&gt;: Used for the cells on even and odd rows.&lt;/li&gt;&lt;br /&gt;&lt;/ul&gt;I encourage the mondrian crew to clean up the reference application, but I'm guessing they are using the messy configuration to convince people to switch to a full Pentaho deployment :-)&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-3534423077474092181?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/3534423077474092181/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=3534423077474092181' title='2 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/3534423077474092181'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/3534423077474092181'/><link rel='alternate' type='text/html' 
href='http://kasper.eobjects.org/2009/11/setting-up-mondrian-using-datasources.html' title='Setting up Mondrian for JNDI DataSources, XML/A and custom CSS styles'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://2.bp.blogspot.com/_UpvxZrigQfQ/SxQE3kmtedI/AAAAAAAAAEQ/gd7K00_zflc/s72-c/pivot-screenshot.png' height='72' width='72'/><thr:total>2</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-108689999356304173</id><published>2009-11-09T16:02:00.003+01:00</published><updated>2009-11-09T16:19:11.674+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='java'/><category scheme='http://www.blogger.com/atom/ns#' term='jpa'/><category scheme='http://www.blogger.com/atom/ns#' term='query'/><category scheme='http://www.blogger.com/atom/ns#' term='persistence'/><category scheme='http://www.blogger.com/atom/ns#' term='intranet'/><category scheme='http://www.blogger.com/atom/ns#' term='danish'/><category scheme='http://www.blogger.com/atom/ns#' term='n+1'/><title type='text'>JPA and the N+1 select problem</title><content type='html'>Warning to readers: This blog entry contains references to articles only  available in Danish. So if you keep on reading be prepared to weep if you want to follow my suggestions ;-)&lt;br /&gt;&lt;br /&gt;Lately I've been working hard on Lund&amp;amp;Bendsens intranet and processes involved around it. I've been using &lt;a href="http://www.lundogbendsen.dk/display/web/Seam+2"&gt;JBoss Seam&lt;/a&gt; for the most part and overall I'm quite thrilled about this choice of web framework. 
One of the cool parts about Seam is the way it integrates with &lt;a href="http://www.lundogbendsen.dk/display/web/Java+Persistence+API+%28JPA%29"&gt;Java Persistence API (JPA)&lt;/a&gt;/&lt;a href="http://www.hibernate.org"&gt;Hibernate&lt;/a&gt; and handles my persistence context even when I'm rendering the views for the intranet.&lt;br /&gt;&lt;br /&gt;At the same time when I have been developing the intranet features in Seam and JPA, my colleague Kenn Sano wrote an excellent article about the &lt;a href="http://www.lundogbendsen.dk/display/web/JPA+performancekiller+N-plus-1+select-problemet"&gt;N+1 Select Problem&lt;/a&gt; in JPA. Here's what it all comes down to (In Danish):&lt;br /&gt;&lt;blockquote&gt;"Man kan [med JPA] "vandre rundt" i en objektgraf og på magisk vis hentes data, som stilles til rådighed i takt med, at vi traverserer - dvs. objekters tilstand indlæses fra databasen alt imens vi bevæger os rundt i objektgrafen. Hvis man ikke er opmærksom på, hvordan JPA fungerer, kan det resultere i mange SQL-kald mod databasen, hvilket kan have stor negativ indvirkning på performance."&lt;/blockquote&gt;I was doing this plentifully in Seam. When presenting a list of courses or a list of students, Seam lets me easily traverse the items in the list by using EL-expressions such as "#{course.location.address}" which involved several N+1 performance penalties. For instance on the list of all planned courses, including their locations, enrolled students etc. I observed a whole of N*4+1 query penalty. So there's no doubt you need to be aware of the impact of your querying strategy.&lt;br /&gt;&lt;br /&gt;Note: I'm not blaming JBoss Seam for this behavior ... 
Seam makes everything a whole lot easier and when everything is easy you just tend to forget to think yourself ;-) Anyways - go &lt;a href="http://www.lundogbendsen.dk/display/web/JPA+performancekiller+N-plus-1+select-problemet"&gt;read the article&lt;/a&gt; if you're interested in JPA  and understand Danish.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-108689999356304173?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/108689999356304173/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=108689999356304173' title='2 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/108689999356304173'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/108689999356304173'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2009/11/jpa-and-n1-select-problem.html' title='JPA and the N+1 select problem'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>2</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-3646388469850906473</id><published>2009-09-08T12:41:00.003+02:00</published><updated>2009-09-08T13:30:46.350+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='book publish pentaho solutions open source business intelligence'/><title type='text'>New book on Open Source Business Intelligence tells the 
DataCleaner-story</title><content type='html'>&lt;a href="http://www.wiley.com/WileyCDA/WileyTitle/productCd-0470484322.html"&gt;&lt;img style="float: right; margin-left: 10px;" src="http://eobjects.org/resources/pentaho-solutions-cover.jpg" alt="" /&gt; &lt;/a&gt;&lt;p&gt; About half a year ago we received an exciting inquiry from Jos van Dongen on behalf of him and his co-author Roland Bouman, telling us that they were writing a new book about &lt;strong&gt;Open Source Business Intelligence&lt;/strong&gt; and in particular &lt;strong&gt;Pentaho&lt;/strong&gt;-based solutions. And for this they were looking into &lt;a class="wiki" href="http://datacleaner.eobjects.org/"&gt;DataCleaner&lt;/a&gt; for the data profiling section of the book! &lt;/p&gt; &lt;p&gt; The book is now out! It's called "Pentaho Solutions" and it's published by Wiley Publishing. You can read &lt;a class="ext-link" href="http://www.wiley.com/WileyCDA/WileyTitle/productCd-0470484322.html"&gt;&lt;span class="icon"&gt;about it and buy it on their website&lt;/span&gt;&lt;/a&gt; as well. &lt;/p&gt; &lt;p&gt; The book contains a walkthrough for building a data warehouse using Open Source tools and in doing so applying &lt;a class="wiki" href="http://datacleaner.eobjects.org/"&gt;DataCleaner&lt;/a&gt; for the important job of profiling and validation. &lt;/p&gt; &lt;p&gt; We congratulate Roland Bouman and Jos van Dongen for their great work to promote Open Source Business Intelligence and thank them for mentioning &lt;a class="wiki" href="http://datacleaner.eobjects.org/"&gt;DataCleaner&lt;/a&gt; while they're at it! 
&lt;/p&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-3646388469850906473?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/3646388469850906473/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=3646388469850906473' title='3 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/3646388469850906473'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/3646388469850906473'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2009/09/new-book-on-open-source-business.html' title='New book on Open Source Business Intelligence tells the DataCleaner-story'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>3</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-4004113707538269234</id><published>2009-07-14T10:48:00.003+02:00</published><updated>2009-07-14T10:53:17.692+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='open source data quality release announcement datacleaner 1.5.2'/><title type='text'>Open Source Data Quality with DataCleaner 1.5.2</title><content type='html'>Today I've &lt;a href="http://datacleaner.eobjects.org/newsitem/datacleaner-1.5.2-released"&gt;announced the release of DataCleaner 1.5.2&lt;/a&gt;, yay! 
I'm pretty excited about this release as I think this is probably the biggest of the minor releases to date. And especially I hope to see that our new "single jar file" distribution-option will attract new users. Go read &lt;a href="http://datacleaner.eobjects.org/newsitem/datacleaner-1.5.2-released"&gt;the announcement&lt;/a&gt; for more details now :)&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-4004113707538269234?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/4004113707538269234/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=4004113707538269234' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/4004113707538269234'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/4004113707538269234'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2009/07/open-source-data-quality-with.html' title='Open Source Data Quality with DataCleaner 1.5.2'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-3935231432204291533</id><published>2009-06-28T13:13:00.007+02:00</published><updated>2010-09-12T12:05:14.986+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='java'/><category scheme='http://www.blogger.com/atom/ns#' term='analyzerbeans'/><category 
scheme='http://www.blogger.com/atom/ns#' term='analysis'/><category scheme='http://www.blogger.com/atom/ns#' term='datastore'/><category scheme='http://www.blogger.com/atom/ns#' term='profiler'/><category scheme='http://www.blogger.com/atom/ns#' term='datacleaner'/><category scheme='http://www.blogger.com/atom/ns#' term='annotations'/><title type='text'>Introducing AnalyzerBeans</title><content type='html'>It's been some time now since I first designed the core API's of the &lt;a href="http://datacleaner.eobjects.org/"&gt;DataCleaner&lt;/a&gt; project and as time goes on, some of my initial assumptions about the design of profilers, validation rules and so on have shown to be less-than-optimal in regards to flexibility and scalability for the application. This is why yesterday I decided to do a major change in the roadmap for the project:&lt;br /&gt;&lt;ul&gt;&lt;li&gt;The idea about the "webmonitor" application (DataCleaner 2.0) has been cancelled for now. If anyone wants to realize this idea it's still something that I am very much interested in, but as you will see I have found that other priorities are more important.&lt;/li&gt;&lt;li&gt;A new project has been founded - for now as a "sandbox" project: &lt;span style="font-weight: bold;"&gt;AnalyzerBeans&lt;/span&gt;. AnalyzerBeans is a rethought architecture for datastore profiling, validation etc. - in one word: "Analysis". When this project is stable and mature we will probably be ready for something I like to think of as a new DataCleaner 2.0.&lt;/li&gt;&lt;/ul&gt;So why rethink datastore analysis? Because the "old way" has proven to be very cumbersome for some tasks that I did not initially realise would have importance. The current DataCleaner design assumes that all profiles, validation rules etc. do serial-processing of rows. 
This is not always the best way to do processing although it simplifies optimization of the execution-mechanism because all components execute in the same way and can thus share result sets etc. In AnalyzerBeans we want the best of both worlds: Flexibility to do all sorts of weird processing and rigidity for the lot of profilers which actually &lt;span style="font-style: italic;"&gt;do process rows serially&lt;/span&gt;.&lt;br /&gt;&lt;br /&gt;The solution is a new annotation based component-model. Each profiler, validation rule etc. will not have to implement certain interfaces because we can now mix and match annotations to the specific type of analysis-component - each "AnalyzerBean". There are a lot more interesting features available when we introduce an annotation-based model, but let me first give you a simple example of how a regular row-processing DataCleaner-style profile would look like:&lt;br /&gt;&lt;blockquote&gt;@AnalyzerBean(name="Row counter", execution=ExecutionType.ROW_PROCESSING)&lt;br /&gt;public class MySerialCounter {&lt;br /&gt;&lt;br /&gt;&amp;nbsp; &amp;nbsp;    @Configured("Table to count")&lt;br /&gt;&amp;nbsp; &amp;nbsp;    private Table table;&lt;br /&gt;&amp;nbsp; &amp;nbsp;    private long count = 0l;&lt;br /&gt;&lt;br /&gt;&amp;nbsp; &amp;nbsp;    @Run&lt;br /&gt;&amp;nbsp; &amp;nbsp;    public void run(Row row, long count) {&lt;br /&gt;&amp;nbsp; &amp;nbsp;    &amp;nbsp; &amp;nbsp; this.count += count;&lt;br /&gt;&amp;nbsp; &amp;nbsp;    }&lt;br /&gt;}&lt;br /&gt;&lt;/blockquote&gt;Now this is not so impressive. I've just replaced the IProfile interface of DataCleaner's API's with some annotations. But notice how I've gotten rid of the ProfileDescriptor class which was used to hold metadata about the profiler. Instead the annotations represent the class metadata. This is actually exactly what annotations are for :-) Also notice that I've gotten a type-safe configuration-property using the @Configured annotation. 
This means that I don't have to parse a string, ask for a Table of the corresponding name etc. And the UI will become a LOT more easy to develop because of type-safe facilities like this.&lt;br /&gt;&lt;br /&gt;But an even more exciting way to use the new API is when creating a whole new type of profiler, an exploring AnalyzerBean:&lt;br /&gt;&lt;blockquote&gt;@AnalyzerBean(name="Row counter", execution=ExecutionType.EXPLORING)&lt;br /&gt;public class MySerialCounter {&lt;br /&gt;&lt;br /&gt;&amp;nbsp; &amp;nbsp;    @Configured("Table to count")&lt;br /&gt;&amp;nbsp; &amp;nbsp;    private Table table;&lt;br /&gt;&amp;nbsp; &amp;nbsp;    private Number count;&lt;br /&gt;&lt;br /&gt;&amp;nbsp; &amp;nbsp;    @Run&lt;br /&gt;&amp;nbsp; &amp;nbsp;    public void run(DataContext dc) {&lt;br /&gt;&amp;nbsp; &amp;nbsp;    &amp;nbsp; &amp;nbsp; DataSet ds = dc.executeQuery(new Query().selectCount().from(table));&lt;br /&gt;&amp;nbsp; &amp;nbsp;    &amp;nbsp; &amp;nbsp; ds.next();&lt;br /&gt;&amp;nbsp; &amp;nbsp;    &amp;nbsp; &amp;nbsp; this.count = (Number) row.getValue(0);&lt;br /&gt;&amp;nbsp; &amp;nbsp;    &amp;nbsp; &amp;nbsp; ds.close();&lt;br /&gt;&amp;nbsp; &amp;nbsp;    }&lt;br /&gt;}&lt;/blockquote&gt;Now this is something totally new: A component that can gain total control of the DataContext and create its own query based on some @Configured parameters. I imagine that this programming model will give us complete flexibility to do exciting new things that were impossible in the DataCleaner-framework: Join testing, non-serial Value Distribution etc.&lt;br /&gt;&lt;br /&gt;There are a few other annotations available to the AnalyzerBean-developers but I will take a look at them in a more in-depth blog-entry later. For now - let me know if you like the ideas and if you have any comments. 
Anyone who would like to help out in the development of the AnalyzerBeans project should visit our &lt;a href="http://eobjects.org/trac/wiki/AnalyzerBeans"&gt;wiki page&lt;/a&gt; on the subject.&lt;br /&gt;&lt;br /&gt;&lt;p&gt;&lt;b&gt;Update (2010-09-12)&lt;/b&gt;&lt;/p&gt;&lt;p&gt;A lot has happened to AnalyzerBeans since this blog entry. Here's a list of blog entries (in chronological order) that will help interested readers dive deeper into the development of AnalyzerBeans:&lt;/p&gt;&lt;ul&gt;&lt;li&gt;&lt;a href="http://kasper.eobjects.org/2010/07/data-transformation-added-to.html"&gt;Data transformation added to AnalyzerBeans&lt;/a&gt;&lt;/li&gt;&lt;li&gt;&lt;a href="http://kasper.eobjects.org/2010/08/visualizations-and-api-documentation.html"&gt;Visualizations and API documentation for AnalyzerBeans&lt;/a&gt;&lt;/li&gt;&lt;li&gt;&lt;a href="http://kasper.eobjects.org/2010/08/now-you-can-run-analyzerbeans-from.html"&gt;Now you can run AnalyzerBeans (from the shell)&lt;/a&gt;&lt;/li&gt;&lt;li&gt;&lt;a href="http://kasper.eobjects.org/2010/09/developing-value-transformer-using.html"&gt;Developing a value transformer using the AnalyzerBeans Java API&lt;/a&gt;&lt;/li&gt;&lt;li&gt;&lt;a href="http://kasper.eobjects.org/2010/09/more-instructions-for-authoring.html"&gt;More instructions for authoring AnalyzerBeans jobs&lt;/a&gt;&lt;/li&gt;&lt;/ul&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-3935231432204291533?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/3935231432204291533/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=3935231432204291533' title='1 Comments'/><link rel='edit' type='application/atom+xml' 
href='http://www.blogger.com/feeds/1301643147176033927/posts/default/3935231432204291533'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/3935231432204291533'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2009/06/introducing-analyzerbeans.html' title='Introducing AnalyzerBeans'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>1</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-6593352163906217591</id><published>2009-06-13T19:57:00.005+02:00</published><updated>2009-06-13T20:19:23.502+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='csv'/><category scheme='http://www.blogger.com/atom/ns#' term='standard measures'/><category scheme='http://www.blogger.com/atom/ns#' term='text'/><category scheme='http://www.blogger.com/atom/ns#' term='file'/><category scheme='http://www.blogger.com/atom/ns#' term='string analysis'/><category scheme='http://www.blogger.com/atom/ns#' term='datacleaner'/><category scheme='http://www.blogger.com/atom/ns#' term='benchmark'/><title type='text'>Performance benchmark: DataCleaner thrives on lower column counts</title><content type='html'>Today I've conducted an experiment. After fixing a bug related to CSV-file-reading in &lt;a href="http://datacleaner.eobjects.org/"&gt;DataCleaner&lt;/a&gt;, I was wondering how performance was impacted by different kinds of CSV file compositions. The reason that I suspected that this could impact performance is that CSV files with many columns will require a somewhat larger chunk of memory in order to keep a single row in memory compared to CSV files with fewer columns. 
In the older versions of DataCleaner we discovered that using 200 or more columns would actually make the application &lt;a href="http://datacleaner.eobjects.org/topic/75/Profiler-1-5-1---lots-of-columns-need-horiz-scroll"&gt;run out of memory&lt;/a&gt;! Fortunately, this bug is fixed, but there is still a significant performance penalty, as this blog post will hopefully show.&lt;br /&gt;&lt;br /&gt;I auto-generated three files for the benchmark: &lt;b&gt;"huge.csv"&lt;/b&gt; with 2.000 columns and 16.000 rows, &lt;b&gt;"long.csv"&lt;/b&gt; with 250 columns and 128.000 rows and &lt;b&gt;"slim.csv"&lt;/b&gt; with only 10 columns and a roaring 3.200.000 rows. Together, each file has &lt;b&gt;32.000.000 cells&lt;/b&gt; to be profiled. I set up a profiler job with the profiles &lt;b&gt;Standard measures&lt;/b&gt; and &lt;b&gt;String analysis&lt;/b&gt; on all columns.&lt;br /&gt;&lt;br /&gt;Here are the (surprising?) results:&lt;br /&gt;&lt;br /&gt;&lt;table style="width: 668px; height: 100px;"&gt;&lt;tbody&gt;&lt;tr&gt;&lt;td style="font-weight: bold;"&gt;filename&lt;/td&gt;  &lt;td style="text-align: right;" width="86"&gt;&lt;b&gt;rows&lt;/b&gt;&lt;/td&gt;    &lt;td style="text-align: right;" width="86"&gt;&lt;b&gt;columns&lt;/b&gt;&lt;/td&gt;        &lt;td style="text-align: right;" width="86"&gt;&lt;b&gt;start time&lt;/b&gt;&lt;/td&gt;    &lt;td style="text-align: right;" width="86"&gt;&lt;b&gt;end time&lt;/b&gt;&lt;/td&gt;&lt;td style="vertical-align: top; text-align: right;"&gt;&lt;span style="font-weight: bold;"&gt;total time&lt;/span&gt;&lt;br /&gt;&lt;/td&gt;           &lt;/tr&gt;   &lt;tr&gt;    &lt;td style="text-align: left;" height="17"&gt;huge.csv&lt;/td&gt;    &lt;td style="text-align: right;" sdval="16000" sdnum="9;"&gt;16000&lt;/td&gt;    &lt;td style="text-align: right;" sdval="2000" sdnum="9;"&gt;&lt;span style="font-size:130%;"&gt;2000&lt;/span&gt;&lt;/td&gt;        &lt;td style="text-align: right;" sdval="0.788055555555555" 
sdnum="9;0;HH:MM:SS"&gt;18:54:48&lt;/td&gt;    &lt;td style="text-align: right;" sdval="0.819768518518518" sdnum="9;0;HH:MM:SS"&gt;19:40:28&lt;/td&gt;&lt;td style="vertical-align: top; text-align: right;"&gt;&lt;span style="font-size:130%;"&gt;45:40&lt;br /&gt;&lt;/span&gt;&lt;/td&gt;           &lt;/tr&gt;   &lt;tr&gt;    &lt;td style="text-align: left;" height="17"&gt;long.csv&lt;/td&gt;    &lt;td style="text-align: right;" sdval="128000" sdnum="9;"&gt;128000&lt;/td&gt;    &lt;td style="text-align: right;" sdval="250" sdnum="9;"&gt;&lt;span style="font-size:130%;"&gt;250&lt;/span&gt;&lt;/td&gt;        &lt;td style="text-align: right;" sdval="0.822835648148148" sdnum="9;0;HH:MM:SS"&gt;19:44:53&lt;/td&gt;    &lt;td style="text-align: right;"&gt;19:52:31&lt;/td&gt;&lt;td style="vertical-align: top; text-align: right;"&gt;&lt;span style="font-size:130%;"&gt;7:38&lt;br /&gt;&lt;/span&gt;&lt;/td&gt;           &lt;/tr&gt;   &lt;tr&gt;    &lt;td style="text-align: left;"&gt;slim.csv&lt;/td&gt;    &lt;td style="text-align: right;"&gt;3200000&lt;/td&gt;    &lt;td style="text-align: right;"&gt;&lt;span style="font-size:130%;"&gt;10&lt;/span&gt;&lt;/td&gt;        &lt;td style="text-align: right;"&gt;19:53:46&lt;/td&gt;    &lt;td style="text-align: right;"&gt;19:55:03&lt;/td&gt;&lt;td style="vertical-align: top; text-align: right;"&gt;&lt;span style="font-size:130%;"&gt;1:17&lt;br /&gt;&lt;/span&gt;&lt;/td&gt;           &lt;/tr&gt;&lt;/tbody&gt;&lt;/table&gt;&lt;br /&gt;&lt;br /&gt;So the bottom line is: Lowering the number of columns has a very significant, positive impact on performance. Having a lot of columns means that you will need to hold a lot more data in memory and needless to say you will have to replace this large chunk of memory a lot of times during the execution of a large profiler job. 
Going all the way from 45 minutes to 1½ is quite an improvement - so don't pre-join tables or anything like that before you run them through your profiler.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-6593352163906217591?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/6593352163906217591/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=6593352163906217591' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/6593352163906217591'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/6593352163906217591'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2009/06/performance-benchmark-datacleaner.html' title='Performance benchmark: DataCleaner thrives on lower column counts'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-746228660994453714</id><published>2009-06-05T01:55:00.003+02:00</published><updated>2009-06-05T02:04:23.639+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='jfreechart'/><category scheme='http://www.blogger.com/atom/ns#' term='api'/><category scheme='http://www.blogger.com/atom/ns#' term='javaone'/><category scheme='http://www.blogger.com/atom/ns#' term='blogging'/><category scheme='http://www.blogger.com/atom/ns#' 
term='chart'/><category scheme='http://www.blogger.com/atom/ns#' term='datacleaner'/><title type='text'>eobjects.org @ JavaOne</title><content type='html'>I am currently hanging out at the lovely &lt;a href="http://java.sun.com/javaone"&gt;JavaOne conference&lt;/a&gt; in San Francisco, checking out cool new Java technology and meeting interesting people. Of course I'm here as a representative from &lt;a href="http://www.lundogbendsen.dk"&gt;my employer&lt;/a&gt;, and I also do some &lt;a href="http://www.lundogbendsen.dk/pages/viewrecentblogposts.action?key=web"&gt;blogging &lt;/a&gt;(in Danish).&lt;br /&gt;&lt;br /&gt;Yesterday I saw an interesting session about &lt;a href="http://www.jfree.org/jfreechart/"&gt;JFreeChart &lt;/a&gt;and surviving as an Open Source professional. Dave Gilbert told us about how he has managed to live from his hobby as a JFreeChart developer, about cool new features of the excellent charting API and about the struggles of making money on Open Source. Very fascinating stuff and I hope that everybody in the chart-consuming business will give it a try. I was quite happily surprised to see the new interactive chart functionality that has been put into the API - I'm wondering how that hadn't caught my attention before now! 
It gave rise to a couple of ideas  (or rather: Sparked my motivation) for me to try and implement charting in &lt;a href="http://datacleaner.eobjects.org"&gt;DataCleaner&lt;/a&gt;.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-746228660994453714?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/746228660994453714/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=746228660994453714' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/746228660994453714'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/746228660994453714'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2009/06/eobjectsorg-javaone.html' title='eobjects.org @ JavaOne'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-3758371056922148990</id><published>2009-04-29T17:53:00.007+02:00</published><updated>2009-04-29T18:55:08.359+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='example'/><category scheme='http://www.blogger.com/atom/ns#' term='maven2'/><category scheme='http://www.blogger.com/atom/ns#' term='ejb'/><category scheme='http://www.blogger.com/atom/ns#' term='build'/><category scheme='http://www.blogger.com/atom/ns#' term='packaging'/><category 
scheme='http://www.blogger.com/atom/ns#' term='enterprise'/><category scheme='http://www.blogger.com/atom/ns#' term='tutorial'/><category scheme='http://www.blogger.com/atom/ns#' term='seam'/><category scheme='http://www.blogger.com/atom/ns#' term='maven'/><category scheme='http://www.blogger.com/atom/ns#' term='ear'/><title type='text'>Seam, EJB's and EAR-packaging in Maven</title><content type='html'>Lately I've been designing &lt;a style="font-weight: bold;" href="http://www.lundogbendsen.dk/display/web/Seam+2"&gt;a new course&lt;/a&gt; on the splendid JBoss Seam (2.1) Web and Java EE framework. One thing that strikes me, being an enterprise Java developer, is that almost no good examples of setting up Seam using Maven exist. I realize that Seam's appeal comes from a tradition of wanting things to be nice and easy, so the seam-gen tool has absolute merits for creating your project but to me the downside of this approach is that you're bound to use Ant as a build tool and for a lot of reasons I won't go into here, I strongly prefer using Maven.&lt;br /&gt;&lt;br /&gt;The examples of using Maven to build Seam applications that I have been able to track down have been limited in a variety of ways:&lt;br /&gt;&lt;ul&gt;&lt;li&gt;Most of them required the use of a specific Maven archetype which altered the default project layout.&lt;/li&gt;&lt;li&gt;Almost all of them were out of date and were bound to a very "custom" maven repository which I don't think is suitable for enterprise application infrastructure.&lt;br /&gt;&lt;/li&gt;&lt;li&gt;Those that didn't fall within the former two points were restricted to web-applications (WAR-files) only and thus didn't support using EJB's in the Seam applications.&lt;/li&gt;&lt;/ul&gt;So this blog post is going to be a walk-through of how I managed (after quite some effort) to configure Maven to be able to build an EAR file containing EJB's and a web-application that is able to utilize these EJB's as Seam components. 
I'm only highlighting the most interesting parts - You can&lt;a style="font-weight: bold;" href="http://eobjects.org/resources/download/Seam-EJB-and-Maven.zip"&gt; download the complete example&lt;/a&gt; for free - and in contrast to the examples I have seen, it's very bare-boned and won't take more than a few seconds to alter to your needs.&lt;br /&gt;&lt;br /&gt;&lt;span style="font-weight: bold;"&gt;Project structure&lt;br /&gt;&lt;/span&gt;OK - to meet the demand of packaging an EAR file I've created a Maven project consisting of three modules. It's possible that this part can be optimized a bit since I use a separate module for doing all the packaging (which to my understanding is the way to do it). The modules are:&lt;br /&gt;&lt;ul&gt;&lt;li&gt;&lt;span style="font-style: italic;"&gt;ejbs&lt;/span&gt; - containing the EJB's and possibly other "backend"-stuff like JPA-classes and so on.&lt;/li&gt;&lt;li&gt;&lt;span style="font-style: italic;"&gt;web&lt;/span&gt; - containing the web application&lt;/li&gt;&lt;li&gt;&lt;span style="font-style: italic;"&gt;packaging&lt;/span&gt; - for packaging the EAR file containing the two other modules (and Seam)&lt;/li&gt;&lt;/ul&gt;&lt;span style="font-weight: bold;"&gt;Which module depends on what?&lt;/span&gt;&lt;br /&gt;One of the big issues that I faced was figuring out how to configure Maven's dependency management framework to generate a correct set of EAR, JAR and WAR files. If this is done wrong it won't work and you'll waste hours (or at least I did) trying to figure out what is wrong.&lt;br /&gt;&lt;br /&gt;All provided dependencies go into the &lt;span style="font-weight: bold;"&gt;parent project&lt;/span&gt;. These are all the dependencies provided by your container (plus Seam itself which will be packaged within the EAR file). You will need to add JBoss's Maven repository to resolve these dependencies. 
Here are the important parts of my &lt;span style="font-weight: bold;"&gt;parent project pom&lt;/span&gt;:&lt;br /&gt;&lt;blockquote&gt;&amp;lt;repositories&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;  &amp;lt;repository&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;id&amp;gt;repository.jboss.org&amp;lt;/id&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;name&amp;gt;JBoss Repository&amp;lt;/name&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;url&amp;gt;http://repository.jboss.org/maven2&amp;lt;/url&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;  &amp;lt;/repository&amp;gt;&lt;br /&gt;&amp;lt;/repositories&amp;gt;&lt;br /&gt;&amp;lt;modules&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;  &amp;lt;module&amp;gt;ejbs&amp;lt;/module&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;  &amp;lt;module&amp;gt;web&amp;lt;/module&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;  &amp;lt;module&amp;gt;packaging&amp;lt;/module&amp;gt;&lt;br /&gt;&amp;lt;/modules&amp;gt;&lt;br /&gt;&amp;lt;dependencies&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;  &amp;lt;dependency&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;groupId&amp;gt;org.jboss.seam&amp;lt;/groupId&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;artifactId&amp;gt;jboss-seam&amp;lt;/artifactId&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;version&amp;gt;2.1.1.GA&amp;lt;/version&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;scope&amp;gt;provided&amp;lt;/scope&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;  &amp;lt;/dependency&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;  &amp;lt;dependency&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;groupId&amp;gt;javax.faces&amp;lt;/groupId&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;artifactId&amp;gt;jsf-api&amp;lt;/artifactId&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  
&amp;lt;version&amp;gt;1.2_02&amp;lt;/version&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;scope&amp;gt;provided&amp;lt;/scope&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;  &amp;lt;/dependency&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;  &amp;lt;dependency&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;groupId&amp;gt;org.hibernate&amp;lt;/groupId&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;artifactId&amp;gt;hibernate-entitymanager&amp;lt;/artifactId&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;version&amp;gt;3.4.0.GA&amp;lt;/version&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;scope&amp;gt;provided&amp;lt;/scope&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;  &amp;lt;/dependency&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;  &amp;lt;dependency&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;groupId&amp;gt;org.hibernate&amp;lt;/groupId&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;artifactId&amp;gt;hibernate-validator&amp;lt;/artifactId&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;version&amp;gt;3.1.0.GA&amp;lt;/version&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;scope&amp;gt;provided&amp;lt;/scope&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;  &amp;lt;/dependency&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;  &amp;lt;dependency&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;groupId&amp;gt;javax.servlet&amp;lt;/groupId&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;artifactId&amp;gt;servlet-api&amp;lt;/artifactId&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;version&amp;gt;2.5&amp;lt;/version&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;scope&amp;gt;provided&amp;lt;/scope&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;  &amp;lt;/dependency&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;  
&amp;lt;dependency&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;groupId&amp;gt;javax.servlet.jsp&amp;lt;/groupId&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;artifactId&amp;gt;jsp-api&amp;lt;/artifactId&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;version&amp;gt;2.1&amp;lt;/version&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;scope&amp;gt;provided&amp;lt;/scope&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;  &amp;lt;/dependency&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;  &amp;lt;dependency&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;groupId&amp;gt;javax.ejb&amp;lt;/groupId&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;artifactId&amp;gt;ejb-api&amp;lt;/artifactId&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;version&amp;gt;3.0&amp;lt;/version&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;  &amp;lt;/dependency&amp;gt;&lt;br /&gt;&amp;lt;/dependencies&amp;gt;&lt;/blockquote&gt;In the &lt;span style="font-weight: bold;"&gt;ejbs module&lt;/span&gt; you will &lt;span style="font-weight: bold;"&gt;not need any dependencies&lt;/span&gt;!&lt;br /&gt;&lt;br /&gt;In the &lt;span style="font-weight: bold;"&gt;web module&lt;/span&gt; you will need to add &lt;span style="font-weight: bold;"&gt;seam-ui&lt;/span&gt; (excluding the "core" seam, because it is provided within the EAR file), &lt;span style="font-weight: bold;"&gt;facelets&lt;/span&gt; and other web-dependencies such as RichFaces, seam-pdf or whatever. 
Here are the dependencies of my &lt;span style="font-weight: bold;"&gt;web module pom&lt;/span&gt;:&lt;br /&gt;&lt;blockquote&gt;&amp;lt;dependencies&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;  &amp;lt;dependency&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;groupId&amp;gt;com.sun.facelets&amp;lt;/groupId&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;artifactId&amp;gt;jsf-facelets&amp;lt;/artifactId&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;version&amp;gt;1.1.11&amp;lt;/version&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;  &amp;lt;/dependency&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;  &amp;lt;dependency&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;groupId&amp;gt;org.jboss.seam&amp;lt;/groupId&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;artifactId&amp;gt;jboss-seam-ui&amp;lt;/artifactId&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;version&amp;gt;2.1.1.GA&amp;lt;/version&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;exclusions&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;exclusion&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;groupId&amp;gt;org.jboss.seam&amp;lt;/groupId&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;artifactId&amp;gt;jboss-seam&amp;lt;/artifactId&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;/exclusion&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;/exclusions&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;  &amp;lt;/dependency&amp;gt;&lt;br /&gt;  &amp;lt;/dependencies&amp;gt;&lt;/blockquote&gt;If you want to write Seam components that pertain only to the &lt;span style="font-weight: bold;"&gt;web module&lt;/span&gt;, then you will probably also need to include the 
&lt;span style="font-style: italic;"&gt;ejbs module as a dependency in the web module pom&lt;/span&gt;. But remember to set the scope as &lt;span style="font-weight: bold;"&gt;provided&lt;/span&gt;, because it's all within the same EAR.&lt;br /&gt;&lt;br /&gt;Last but not least you will need to write the &lt;span style="font-weight: bold;"&gt;packaging module pom&lt;/span&gt;. The &lt;a href="http://docs.jboss.com/seam/2.1.1.GA/reference/en-US/html/configuration.html#d0e23778"&gt;Seam Documentation&lt;/a&gt; has been used to guide how this pom is structured: The &lt;span style="font-weight: bold;"&gt;ejbs module&lt;/span&gt; needs to be a registered EJB module. The &lt;span style="font-weight: bold;"&gt;web module&lt;/span&gt; needs to be a registered WAR module. And &lt;span style="font-weight: bold;"&gt;Seam itself&lt;/span&gt; needs to be a registered EJB module as well! &lt;span style="font-weight: bold;"&gt;JBoss EL&lt;/span&gt; needs to be placed in the /lib directory of the EAR and you need to make sure to exclude the &lt;span style="font-weight: bold;"&gt;EL API&lt;/span&gt; dependency from several artifacts - otherwise you'll get weird classpath issues when deploying. 
The important parts of the &lt;span style="font-weight: bold;"&gt;packaging module pom&lt;/span&gt; look like this (pay attention to the parts in bold):&lt;br /&gt;&lt;blockquote&gt;&amp;lt;build&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;  &amp;lt;plugins&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;plugin&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;artifactId&amp;gt;maven-ear-plugin&amp;lt;/artifactId&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;configuration&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;modules&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;jarModule&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;groupId&amp;gt;org.jboss.el&amp;lt;/groupId&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;artifactId&amp;gt;jboss-el&amp;lt;/artifactId&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;includeInApplicationXml&amp;gt;false&amp;lt;/includeInApplicationXml&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;bundleDir&amp;gt;lib&amp;lt;/bundleDir&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;/jarModule&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;webModule&amp;gt;&lt;br 
/&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;!-- add &lt;span style="font-weight: bold;"&gt;web module&lt;/span&gt; groupId and artifactId here --&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;contextRoot&amp;gt;&lt;span style="font-weight: bold;"&gt;/seam-ejb-ex&lt;/span&gt;&amp;lt;/contextRoot&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;/webModule&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;/modules&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;/configuration&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;/plugin&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;  &amp;lt;/plugins&amp;gt;&lt;br /&gt;&amp;lt;/build&amp;gt;&lt;br /&gt;&amp;lt;dependencies&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;  &amp;lt;dependency&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;!-- add &lt;span style="font-weight: bold;"&gt;ejbs module&lt;/span&gt; groupId, artifactId and version here --&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;type&amp;gt;ejb&amp;lt;/type&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;  &amp;lt;/dependency&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;  &amp;lt;dependency&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;!-- add &lt;span style="font-weight: bold;"&gt;web module&lt;/span&gt; groupId, artifactId and version here --&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;type&amp;gt;war&amp;lt;/type&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;  &amp;lt;/dependency&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;  &amp;lt;dependency&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  
&amp;lt;groupId&amp;gt;org.jboss.seam&amp;lt;/groupId&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;artifactId&amp;gt;jboss-seam&amp;lt;/artifactId&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;version&amp;gt;2.1.1.GA&amp;lt;/version&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;type&amp;gt;ejb&amp;lt;/type&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;exclusions&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;exclusion&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;groupId&amp;gt;javax.el&amp;lt;/groupId&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;artifactId&amp;gt;el-api&amp;lt;/artifactId&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;/exclusion&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;/exclusions&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;  &amp;lt;/dependency&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;  &amp;lt;dependency&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;groupId&amp;gt;org.jboss.el&amp;lt;/groupId&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;artifactId&amp;gt;jboss-el&amp;lt;/artifactId&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;version&amp;gt;1.0_02.CR2&amp;lt;/version&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;type&amp;gt;jar&amp;lt;/type&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;exclusions&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;exclusion&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;groupId&amp;gt;javax.el&amp;lt;/groupId&amp;gt;&lt;br 
/&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;artifactId&amp;gt;el-api&amp;lt;/artifactId&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;/exclusion&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;  &amp;lt;/exclusions&amp;gt;&lt;br /&gt;&amp;nbsp;&amp;nbsp;  &amp;lt;/dependency&amp;gt;&lt;br /&gt;  &amp;lt;/dependencies&amp;gt;&lt;/blockquote&gt;So that's all the Maven configuration. Now to some "gotchas" in regards to Seam configuration. A lot of these things are being "disguised" by seam-gen so consider this a list of things you might have forgotten if you're building the project from the bottom up:&lt;br /&gt;&lt;ul&gt;&lt;li&gt;Remember to put empty &lt;span style="font-weight: bold;"&gt;seam.properties&lt;/span&gt; files into the resource folders of &lt;span style="font-weight: bold;"&gt;both&lt;/span&gt; the ejbs and web &lt;span style="font-weight: bold;"&gt;modules&lt;/span&gt;.&lt;/li&gt;&lt;li&gt;Remember to register the Seam interceptors (as described in the &lt;a href="http://docs.jboss.com/seam/2.1.1.GA/reference/en-US/html/configuration.html#d0e23572"&gt;Seam Documentation&lt;/a&gt;) in the &lt;span style="font-weight: bold;"&gt;ejb-jar.xml&lt;/span&gt; file, located in the &lt;span style="font-weight: bold;"&gt;resources/META-INF&lt;/span&gt; folder of the &lt;span style="font-weight: bold;"&gt;ejbs module&lt;/span&gt;.&lt;/li&gt;&lt;li&gt;The &lt;span style="font-weight: bold;"&gt;components.xml&lt;/span&gt; file should be located in the web module's &lt;span style="font-weight: bold;"&gt;WEB-INF&lt;/span&gt; folder. But you can't just copy the file from a seam-gen project because seam-gen dynamically replaces some "magic strings" in it to see to container compliance. 
Instead you will have to add this static entry to the components.xml file yourself:&lt;br /&gt;&lt;blockquote&gt;&amp;lt;core:init jndi-pattern="seam-ejb-ex/#{ejbName}/local"&amp;gt;&amp;lt;/core:init&amp;gt;&lt;/blockquote&gt;Note the &lt;b&gt;seam-ejb-ex&lt;/b&gt; part of that static jndi pattern string. You will have to replace this part of the string with the context root entry that was highlighted in the &lt;b&gt;packaging module pom&lt;/b&gt;!&lt;/li&gt;&lt;/ul&gt;&lt;br /&gt;I hope this blog entry has cleared up some of the confusion that I met when I tried building Maven-based Seam/EJB projects. Please let me know if it works out and remember that you can &lt;a style="font-weight: bold;" href="http://eobjects.org/resources/download/Seam-EJB-and-Maven.zip"&gt;download the full example&lt;/a&gt; and use it as a reference as you like.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-3758371056922148990?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/3758371056922148990/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=3758371056922148990' title='17 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/3758371056922148990'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/3758371056922148990'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2009/04/seam-ejbs-and-ear-packaging-in-maven.html' title='Seam, EJB&apos;s and EAR-packaging in Maven'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image 
rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>17</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-7795807447998251428</id><published>2009-04-20T19:57:00.002+02:00</published><updated>2009-04-20T19:59:27.182+02:00</updated><title type='text'>DataCleaner 1.5.1 released</title><content type='html'>&lt;p&gt;I'm happy to announce the release of DataCleaner version 1.5.1. This release is a minor release, nevertheless containing a few nice features - especially for the users who are enjoying the exporting features that were introduced in 1.5:&lt;/p&gt;&lt;ul&gt;&lt;li&gt;An additional HTML export format has been added to the built-in export formats (usable when exporting Profiler results in the desktop app and when executing the runjob command-line tool).&lt;/li&gt;&lt;li&gt;The export format is now choosable directly in the desktop app.&lt;/li&gt;&lt;li&gt;Four new measures were added to the String Analysis profile: avg. 
chars and max/min/avg white spaces.&lt;/li&gt;&lt;/ul&gt;&lt;p&gt;The new version of DataCleaner is (as always) downloadable for free on the &lt;a href="http://datacleaner.eobjects.org/downloads"&gt;downloads page&lt;/a&gt; and feedback from users is also greatly appreciated, ie:&lt;/p&gt;&lt;ul&gt;&lt;li&gt;Fill out our &lt;a href="http://datacleaner.eobjects.org/survey"&gt;online user survey&lt;/a&gt;, or&lt;/li&gt;&lt;li&gt;Post your comments and questions at our &lt;a href="http://datacleaner.eobjects.org/forum/1"&gt; discussion forum&lt;/a&gt;.&lt;/li&gt;&lt;/ul&gt;&lt;p&gt;We hope that you all enjoy DataCleaner 1.5.1.&lt;/p&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-7795807447998251428?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/7795807447998251428/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=7795807447998251428' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/7795807447998251428'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/7795807447998251428'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2009/04/datacleaner-151-released.html' title='DataCleaner 1.5.1 released'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' 
src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-1981468891429151586</id><published>2009-02-11T15:56:00.004+01:00</published><updated>2009-02-11T16:04:54.675+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='quality'/><category scheme='http://www.blogger.com/atom/ns#' term='data'/><category scheme='http://www.blogger.com/atom/ns#' term='interview'/><category scheme='http://www.blogger.com/atom/ns#' term='article'/><category scheme='http://www.blogger.com/atom/ns#' term='pro'/><category scheme='http://www.blogger.com/atom/ns#' term='datacleaner'/><title type='text'>Data quality pro interview</title><content type='html'>Dylan Jones over at &lt;a href="http://www.dataqualitypro.com"&gt;data quality pro&lt;/a&gt; is working on a feature about &lt;a href="http://datacleaner.eobjects.org"&gt;DataCleaner&lt;/a&gt; and I'm very thankful for his work already (and pretty excited to see the final result). The feature has just been started with an interview with a very important person ... Me! 
:-) For all of those who take an interest in DataCleaner, the visions of the product and its story, I hope that you will head over there and &lt;a href="http://www.dataqualitypro.com/data-quality-home/interview-with-kasper-srensen-creator-of-datacleaner.html"&gt;read the article&lt;/a&gt;.&lt;br /&gt;&lt;br /&gt;To be continued with more posts on the &lt;a href="http://www.dataqualitypro.com"&gt;data quality pro&lt;/a&gt; articles.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-1981468891429151586?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/1981468891429151586/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=1981468891429151586' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/1981468891429151586'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/1981468891429151586'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2009/02/dataqualitypro-interview.html' title='Data quality pro interview'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-5527530855648824341</id><published>2009-01-25T16:38:00.011+01:00</published><updated>2009-01-25T17:17:10.360+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='mdx'/><category 
scheme='http://www.blogger.com/atom/ns#' term='olap'/><category scheme='http://www.blogger.com/atom/ns#' term='image'/><category scheme='http://www.blogger.com/atom/ns#' term='cube'/><category scheme='http://www.blogger.com/atom/ns#' term='graphics'/><category scheme='http://www.blogger.com/atom/ns#' term='icon'/><category scheme='http://www.blogger.com/atom/ns#' term='tango'/><title type='text'>Free OLAP cube icon</title><content type='html'>Okay this is a bit off-topic compared to my normal posts, but here goes.&lt;br /&gt;&lt;br /&gt;When I do websites or GUI design I usually look out for free/open source icon packages such as &lt;a href="http://tango.freedesktop.org/"&gt;Tango&lt;/a&gt; or &lt;a href="http://www.everaldo.com/crystal"&gt;Crystal&lt;/a&gt;. I have been looking for a nice-looking icon to represent an OLAP cube, preferably in a style and coloring similar to the Tango icon set. Sorry to say, I didn't find any, so I went ahead and spent some hours creating a new icon on my own. 
Here's the result:&lt;br /&gt;&lt;br /&gt;&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://4.bp.blogspot.com/_UpvxZrigQfQ/SXyITZaAUuI/AAAAAAAAAB8/YzFw-l_qVFo/s1600-h/cube_icon.png"&gt;&lt;img style="margin: 0px auto 10px; display: block; text-align: center; cursor: pointer; width: 120px; height: 120px;" src="http://4.bp.blogspot.com/_UpvxZrigQfQ/SXyITZaAUuI/AAAAAAAAAB8/YzFw-l_qVFo/s400/cube_icon.png" alt="" id="BLOGGER_PHOTO_ID_5295257128699712226" border="0" /&gt;&lt;/a&gt;&lt;br /&gt;And a plain version without the sum/count text elements (good if you need to resize it to very small sizes):&lt;br /&gt;&lt;br /&gt;&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://3.bp.blogspot.com/_UpvxZrigQfQ/SXyOvTqnBlI/AAAAAAAAACE/6kC_imaOC-0/s1600-h/cube_icon_plan.png"&gt;&lt;img style="margin: 0px auto 10px; display: block; text-align: center; cursor: pointer; width: 120px; height: 120px;" src="http://3.bp.blogspot.com/_UpvxZrigQfQ/SXyOvTqnBlI/AAAAAAAAACE/6kC_imaOC-0/s400/cube_icon_plan.png" alt="" id="BLOGGER_PHOTO_ID_5295264205264848466" border="0" /&gt;&lt;/a&gt;&lt;br /&gt;I'm giving this away under a &lt;a href="http://en.wikipedia.org/wiki/Beerware"&gt;beerware license&lt;/a&gt;, so if you want it, it's yours.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-5527530855648824341?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/5527530855648824341/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=5527530855648824341' title='1 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/5527530855648824341'/><link rel='self' type='application/atom+xml' 
href='http://www.blogger.com/feeds/1301643147176033927/posts/default/5527530855648824341'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2009/01/free-olap-cube-icon.html' title='Free OLAP cube icon'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://4.bp.blogspot.com/_UpvxZrigQfQ/SXyITZaAUuI/AAAAAAAAAB8/YzFw-l_qVFo/s72-c/cube_icon.png' height='72' width='72'/><thr:total>1</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-89708681990278781</id><published>2009-01-20T13:11:00.004+01:00</published><updated>2009-01-20T13:48:30.459+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='open source'/><category scheme='http://www.blogger.com/atom/ns#' term='data quality'/><category scheme='http://www.blogger.com/atom/ns#' term='etl'/><category scheme='http://www.blogger.com/atom/ns#' term='profiler'/><category scheme='http://www.blogger.com/atom/ns#' term='enterprise'/><category scheme='http://www.blogger.com/atom/ns#' term='master data management'/><category scheme='http://www.blogger.com/atom/ns#' term='data warehousing'/><category scheme='http://www.blogger.com/atom/ns#' term='architechture'/><category scheme='http://www.blogger.com/atom/ns#' term='datacleaner'/><category scheme='http://www.blogger.com/atom/ns#' term='data integration'/><title type='text'>DataCleaner 1.5 - a heavy league Data Profiler</title><content type='html'>Often when I speak to data quality professionals and people from the business intelligence world I get the notion that most people think of Open Source tools as slightly immature when it comes to heavy processing, large loads, 
millions-of-rows-kinda-stuff. And this has had some truth to it. I don't want to name names, but at least I have heard a lot of stories about Open Source data integration / ETL tools that weren't up for the job when you had millions of rows to transform. So I guess this notion has stuck to Open Source data profilers and data quality applications too...&lt;br /&gt;&lt;br /&gt;In &lt;a href="http://datacleaner.eobjects.org"&gt;DataCleaner&lt;/a&gt; 1.5 I want this notion demystified and eradicated! Here are some of the things we are working on to make this release a truly enterprise-ready, performance-oriented and scalable application:&lt;br /&gt;&lt;ul&gt;&lt;li&gt;&lt;span style="font-weight: bold;"&gt;Multi-threaded, multi-connection, multi-query execution engine&lt;/span&gt;&lt;br /&gt;The execution engine in DataCleaner has been thoroughly refactored to support multithreading, multiple connections and query-splitting to perform loadbalancing on the threads and connections. This really boosts performance for large jobs and sets the bar for processing large result sets in Open Source tools I think.&lt;br /&gt;&lt;/li&gt;&lt;li&gt;&lt;span style="font-weight: bold;"&gt;On-disk caching for memory-intensive profiles and validation rules&lt;/span&gt;&lt;br /&gt;Some of the profiles and validation rules are almost inherently memory intensive. We are doing a lot of work optimizing them as much as we can but some things are simply not possible to change. As an example, a Value Distribution profile simply HAS to know all distinct values of each column that is being profiled. If it doesn't - then it's not a value distribution profile. So we are implementing various degrees of on-disk caching to make this work without flooding memory. 
This means that the stability of DataCleaner is improved to a heavy league level.&lt;br /&gt;&lt;/li&gt;&lt;li&gt;&lt;span style="font-weight: bold;"&gt;Batch processing and scheduling&lt;/span&gt;&lt;br /&gt;The last (but not least important) feature that I'm going to mention is the new command line interface for DataCleaner. By providing a command line interface for executing DataCleaner jobs you are able to introduce DataCleaner into a grand architecture for data quality, data warehousing, master data management or whatever it is that you are using it for. You can schedule it using any scheduling tool that you like and you can save the results to automate reporting and result analysis.&lt;br /&gt;&lt;/li&gt;&lt;/ul&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-89708681990278781?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/89708681990278781/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=89708681990278781' title='4 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/89708681990278781'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/89708681990278781'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2009/01/datacleaner-15-heavy-league-data.html' title='DataCleaner 1.5 - a heavy league Data Profiler'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' 
src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>4</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-8220138365784061494</id><published>2009-01-07T01:09:00.001+01:00</published><updated>2009-01-07T01:09:34.797+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='java'/><category scheme='http://www.blogger.com/atom/ns#' term='django'/><category scheme='http://www.blogger.com/atom/ns#' term='website'/><category scheme='http://www.blogger.com/atom/ns#' term='python'/><category scheme='http://www.blogger.com/atom/ns#' term='datacleaner'/><title type='text'>Using Python and Django to build the new DataCleaner website</title><content type='html'>I have for a long time been a dedicated Java developer and in many ways, still am. But developing the new website for &lt;a href="http://datacleaner.eobjects.org"&gt;DataCleaner&lt;/a&gt; have been quite an eye-opener for the potential of dynamic languages and Python in particular. There are so many things about that language that I love and I must say that doing the same thing in Java would have taken at least twice the time! And thats even though I'm not an unexperienced Java developer.&lt;br /&gt;&lt;br /&gt;OK, so what's the big difference? Well, deployment is one very crucial difference. J2EE servers are great for stability and system administration but often I find myself, as a web developer, not needing all those things that much - I just need a server that always runs and will tell me what I am doing wrong. &lt;a href="http://www.djangoproject.org"&gt;Django&lt;/a&gt; (which have been my Python web-framework) have been excellent in doing this for me so I can kick-start my application in a matter of seconds.&lt;br /&gt;&lt;br /&gt;Type-safety is another big difference. Java is type-safe, Python (and other dynamic languages) is not. 
For back-end development I am a big advocate of type-safety but in front-end development dynamic classes are such a great treat! An example of this is when transfering data from Controllers to the View in the Django framework's Model-View-Controller architecture. If you want to present some domain objects that are related in the view, but not in the domain model (or perhaps the domain model has some details to it that you want to skip for understandability), then you just infer a completely new attribute into the domain object! In Java or other type-safe languages such as C#, you would typically have to create a Map for storing the new particular association and then do a lookup in the view to resolve the association. This means more "logic" in the view and code that is harder to comprehend.&lt;br /&gt;&lt;br /&gt;All in all I'm very happy to use Django. I would have liked a few more features in their &lt;a href="http://docs.djangoproject.com/en/dev/ref/models/querysets/"&gt;QuerySet API&lt;/a&gt; (especially aggregation queries, which should be on the way) but then again - for the typical website it is pretty sufficient and allows fallback to native SQL. Thank you Django.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Note&lt;/b&gt;: This is not to say that I am abandoning Java, not at all! 
I love Java for it's  stability and superior integrational capabilities, but in some cases, I simply want something that is fast and more in tune with the user experiencing and prototyping process of building websites.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-8220138365784061494?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/8220138365784061494/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=8220138365784061494' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/8220138365784061494'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/8220138365784061494'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2009/01/using-python-and-django-to-build-new_2193.html' title='Using Python and Django to build the new DataCleaner website'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-8218451069318217609</id><published>2008-11-23T00:18:00.027+01:00</published><updated>2008-11-24T13:02:09.262+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='sql'/><category scheme='http://www.blogger.com/atom/ns#' term='java'/><category scheme='http://www.blogger.com/atom/ns#' term='datastore'/><category scheme='http://www.blogger.com/atom/ns#' 
term='query'/><category scheme='http://www.blogger.com/atom/ns#' term='hierarchy'/><category scheme='http://www.blogger.com/atom/ns#' term='file'/><category scheme='http://www.blogger.com/atom/ns#' term='xml'/><category scheme='http://www.blogger.com/atom/ns#' term='mapping'/><category scheme='http://www.blogger.com/atom/ns#' term='flattening'/><category scheme='http://www.blogger.com/atom/ns#' term='relational'/><category scheme='http://www.blogger.com/atom/ns#' term='table'/><category scheme='http://www.blogger.com/atom/ns#' term='database'/><title type='text'>How to query an XML file as if it was a relational database</title><content type='html'>For a couple of times I've spent my blogging time explaining how you can use &lt;a href="http://eobjects.org/metamodel"&gt;MetaModel&lt;/a&gt; to &lt;a href="http://kasper.eobjects.dk/2008/11/querying-csv-file.html"&gt;query CSV files&lt;/a&gt; and &lt;a href="http://kasper.eobjects.dk/2008/11/query-your-excel-spreadsheet-with-java.html"&gt;Excel spreadsheets&lt;/a&gt; just as if they where regular databases which enable filtering, grouping, sorting etc. in their query languages (usually some form of SQL with more or less dialect).&lt;br /&gt;&lt;br /&gt;Today let's take a look at MetaModels support for XML content. XML is, to say the least, very different from relational databases. Whereas relational databases consist of tables as the basic structural element, the XML structure is hierarchical and not table-based. You could say that the structure of XML is potentially a lot more dynamic because metaphorically XML supports nesting tables inside each others to emphasize the structure and relations. So dealing with XML in a framework like &lt;a href="http://eobjects.org/metamodel"&gt;MetaModel&lt;/a&gt; was not a no-brainer in terms of &lt;a href="http://eobjects.org/trac/discussion/7/28"&gt;design considerations and decisions&lt;/a&gt;. 
The main theme of MetaModel has always been &lt;b&gt;to provide a consistent model that was the same even though you had to work with different kinds of datastores&lt;/b&gt;. So how did we do this when XML and other table-based datastores are so inherently different? I'll show you using an example...&lt;br /&gt;&lt;br /&gt;Consider this compressed XML example, i.e. an RSS newsfeed (coincidentally taken from &lt;a href="http://eobjects.org/trac/blog?format=rss"&gt;the eobjects.org newsfeed&lt;/a&gt;):&lt;blockquote&gt;&amp;lt;?xml version="1.0"?&amp;gt;&amp;lt;rss version="2.0"&amp;gt;&lt;br /&gt;&amp;nbsp; &amp;lt;channel&amp;gt;&lt;br /&gt;&amp;nbsp; &amp;lt;title&amp;gt;eobjects - Blog&amp;lt;/title&amp;gt;&lt;br /&gt;&amp;nbsp; &amp;lt;link&amp;gt;http://eobjects.org/trac/blog&amp;lt;/link&amp;gt;&lt;br /&gt;&amp;nbsp; &amp;lt;description&amp;gt;Welcome to the new eobjects.dk website&amp;lt;/description&amp;gt;&lt;br /&gt;&amp;nbsp; &amp;lt;item&amp;gt;&lt;br /&gt;&amp;nbsp; &amp;nbsp;   &amp;lt;title&amp;gt;MetaModel 1.1 released!&amp;lt;/title&amp;gt;&lt;br /&gt;&amp;nbsp; &amp;nbsp;   &amp;lt;pubdate&amp;gt;Wed, 05 Nov 2008 14:01:02 GMT&amp;lt;/pubdate&amp;gt;&lt;br /&gt;&amp;nbsp; &amp;nbsp;   &amp;lt;link&amp;gt;http://eobjects.org/trac/blog/metamodel-1.1-released&amp;lt;/link&amp;gt;&lt;br /&gt;&amp;nbsp; &amp;nbsp;   &amp;lt;description&amp;gt;text goes here...&amp;lt;/description&amp;gt;&lt;br /&gt;&amp;nbsp; &amp;nbsp;   &amp;lt;category&amp;gt;&lt;i style="color: green;"&gt;release&lt;/i&gt;&amp;lt;/category&amp;gt;&amp;lt;category&amp;gt;&lt;i style="color: green;"&gt;license&lt;/i&gt;&amp;lt;/category&amp;gt;&amp;lt;category&amp;gt;&lt;i style="color: green;"&gt;metamodel&lt;/i&gt;&amp;lt;/category&amp;gt;&lt;br /&gt;&amp;nbsp; &amp;lt;/item&amp;gt;&lt;br /&gt;&amp;nbsp; &amp;lt;item&amp;gt;&lt;br /&gt;&amp;nbsp; &amp;nbsp;   &amp;lt;title&amp;gt;DataCleaner 1.5 "snapshot" released&amp;lt;/title&amp;gt;&lt;br /&gt;&amp;nbsp; &amp;nbsp;   &amp;lt;pubdate&amp;gt;Mon, 13 Oct 2008 07:00:13 
GMT&amp;lt;/pubdate&amp;gt;&lt;br /&gt;&amp;nbsp; &amp;nbsp;   &amp;lt;link&amp;gt;http://eobjects.org/trac/blog/datacleaner-1.5-snapshot-released&amp;lt;/link&amp;gt;&lt;br /&gt;&amp;nbsp; &amp;nbsp;   &amp;lt;description&amp;gt;text goes here...&amp;lt;/description&amp;gt;&lt;br /&gt;&amp;nbsp; &amp;nbsp;   &amp;lt;category&amp;gt;&lt;i style="color: green;"&gt;release&lt;/i&gt;&amp;lt;/category&amp;gt;&amp;lt;category&amp;gt;&lt;i style="color: green;"&gt;datacleaner&lt;/i&gt;&amp;lt;/category&amp;gt;&amp;lt;category&amp;gt;&lt;i style="color: green;"&gt;license&lt;/i&gt;&amp;lt;/category&amp;gt;&lt;br /&gt;&amp;nbsp; &amp;lt;/item&amp;gt;&lt;br /&gt;&amp;nbsp; &amp;lt;/channel&amp;gt;&lt;br /&gt;&amp;lt;/rss&amp;gt;&lt;/blockquote&gt;&lt;img style="border: 1px solid rgb(221, 221, 221); float: right;" src="http://3.bp.blogspot.com/_UpvxZrigQfQ/SSiTXZQrujI/AAAAAAAAABw/zO48FIKHFGo/s400/rss_structure.png" alt="" id="BLOGGER_PHOTO_ID_5271625393964300850" border="0" /&gt;&lt;br /&gt;I want you to notice a few things about the RSS structure so that we get a full understanding of how MetaModel will interpret the XML file and map it to a table based structure. The resulting MetaModel structure is illustrated to the right (you will perhaps also notice that I have edited out a lot of the details in the XML code above, for brevity):&lt;ul&gt;&lt;br /&gt;&lt;li&gt;Notice that the &amp;lt;channel&amp;gt; element does not contain any data but only acts as what I would call a "wrapper tag". There are some different inner elements inside &amp;lt;channel&amp;gt; which causes MetaModel to name tables according to this shared root node. Hence the table names "channel_item", "channel_title" etc.&lt;/li&gt;&lt;br /&gt;&lt;li&gt;A lot of values are contained in inner tags within the &amp;lt;item&amp;gt; elements, like the "title", "pubDate" and "link" elements. 
Since these appear only once for each &amp;lt;item&amp;gt; they are considered columns in a general &amp;lt;item&amp;gt; table: "channel_item".&lt;br /&gt;&lt;/li&gt;&lt;br /&gt;&lt;li&gt;Notice the multiplicity of the &amp;lt;category&amp;gt; tags inside the &amp;lt;item&amp;gt;'s. This causes a separate table to be created to handle multiplicity: "channel_item_category".&lt;br /&gt;&lt;/li&gt;&lt;/ul&gt;&lt;br /&gt;As you can see we do a rather intuitive "XML to table-based model" mapping in MetaModel. This is done automatically through the process that we call &lt;b&gt;auto-flattening of tables&lt;/b&gt;. If you prefer, you can also &lt;a href="http://eobjects.org/metamodel/apidocs/current/dk/eobjects/metamodel/XmlDataContextStrategy.html"&gt;flatten the tables manually&lt;/a&gt; if you wish to compose the table model yourself (but I'll have to dedicate a separate blog entry to that topic sometime).&lt;br /&gt;The XML-elements to tables mapping enables you to do all the cool querying stuff with your XML files, that I have shown you so many times before. 
Let's do a simple example:&lt;blockquote&gt;File file = new File("newsfeed.rss");&lt;br /&gt;&lt;a href="http://eobjects.org/metamodel/apidocs/current/dk/eobjects/metamodel/DataContext.html"&gt;DataContext&lt;/a&gt; dc = &lt;a href="http://eobjects.org/metamodel/apidocs/current/dk/eobjects/metamodel/DataContextFactory.html"&gt;DataContextFactory&lt;/a&gt;.createXmlDataContext(file, true, true);&lt;br /&gt;&lt;a href="http://eobjects.org/metamodel/apidocs/current/dk/eobjects/metamodel/schema/Schema.html"&gt;Schema&lt;/a&gt; schema = dc.getDefaultSchema();&lt;br /&gt;&lt;br /&gt;&lt;a href="http://eobjects.org/metamodel/apidocs/current/dk/eobjects/metamodel/schema/Table.html"&gt;Table&lt;/a&gt; table = schema.getTableByName("channel_item_category");&lt;br /&gt;&lt;a href="http://eobjects.org/metamodel/apidocs/current/dk/eobjects/metamodel/schema/Column.html"&gt;Column&lt;/a&gt; column = table.getColumnByName("category");&lt;br /&gt;&lt;br /&gt;&lt;i style="color: green;"&gt;// We'll make a query to show us the most popular newsitem categories:&lt;br /&gt;// SELECT category, COUNT(*) as num_items FROM channel_item_category GROUP BY category ORDER BY num_items DESC&lt;/i&gt;&lt;br /&gt;&lt;a href="http://eobjects.org/metamodel/apidocs/current/dk/eobjects/metamodel/query/SelectItem.html"&gt;SelectItem&lt;/a&gt; countItem = &lt;a href="http://eobjects.org/metamodel/apidocs/current/dk/eobjects/metamodel/query/SelectItem.html"&gt;SelectItem&lt;/a&gt;.getCountAllItem();&lt;br /&gt;countItem.setAlias("num_items");&lt;br /&gt;&lt;br /&gt;&lt;a href="http://eobjects.org/metamodel/apidocs/current/dk/eobjects/metamodel/query/Query.html"&gt;Query&lt;/a&gt; q = new &lt;a href="http://eobjects.org/metamodel/apidocs/current/dk/eobjects/metamodel/query/Query.html"&gt;Query&lt;/a&gt;().select(column).select(countItem).from(table).groupBy(column).orderBy(countItem, &lt;a 
href="http://eobjects.org/metamodel/apidocs/current/dk/eobjects/metamodel/query/OrderByItem.Direction.html"&gt;Direction&lt;/a&gt;.DESC);&lt;br /&gt;&lt;br /&gt;&lt;a href="http://eobjects.org/metamodel/apidocs/current/dk/eobjects/metamodel/data/DataSet.html"&gt;DataSet&lt;/a&gt; dataSet = dc.executeQuery(q);&lt;br /&gt;&lt;i style="color: green;"&gt;// Do something with the dataset!&lt;/i&gt;&lt;br /&gt;&lt;/blockquote&gt;&lt;br /&gt;If you're just interested in finding out what your XML format looks like after being autoflattened by MetaModel, I recommend downloading &lt;a href="http://eobjects.org/datacleaner"&gt;DataCleaner&lt;/a&gt; (from which the screenshot to the right was taken). Over and out, enjoy &lt;a href="http://eobjects.org/metamodel"&gt;MetaModel&lt;/a&gt;...&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-8218451069318217609?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/8218451069318217609/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=8218451069318217609' title='1 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/8218451069318217609'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/8218451069318217609'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2008/11/how-to-query-xml-file-as-if-it-was.html' title='How to query an XML file as if it was a relational database'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' 
src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://3.bp.blogspot.com/_UpvxZrigQfQ/SSiTXZQrujI/AAAAAAAAABw/zO48FIKHFGo/s72-c/rss_structure.png' height='72' width='72'/><thr:total>1</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-4205535637114893842</id><published>2008-11-18T11:28:00.003+01:00</published><updated>2008-11-18T11:58:18.568+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='open source'/><category scheme='http://www.blogger.com/atom/ns#' term='goals'/><category scheme='http://www.blogger.com/atom/ns#' term='community'/><category scheme='http://www.blogger.com/atom/ns#' term='planning'/><category scheme='http://www.blogger.com/atom/ns#' term='social construction'/><category scheme='http://www.blogger.com/atom/ns#' term='motivation'/><category scheme='http://www.blogger.com/atom/ns#' term='datacleaner'/><title type='text'>Setting goals for volunteer Open Source projects and DataCleaner in particular</title><content type='html'>As you probably know I'm the founder and main developer of the Open Source project called &lt;a href="http://eobjects.org/datacleaner"&gt;DataCleaner&lt;/a&gt;. While there have been some notable contributions from outsiders I wouldn't be arrogant if I said that the crucial and main parts of the code-base for DataCleaner was written by me. But in a lot ways I often think of me as a &lt;span style="font-weight: bold;"&gt;medium of change&lt;/span&gt;, the bringer of the code, not the real source of the goal-setting and planning of the project. 
Also my main job in regards to DataCleaner have been to try and attract more developers and broaden the interest among users for DataCleaner, but that's not really relevant for this topic.&lt;br /&gt;&lt;br /&gt;In this blog post I'll investigate the relationship between me, the medium, and the community who represents the real decisionmaking entity of DataCleaner. I suspect that a lot of times people who participate in a community doesn't realize the powerful position that they possess and how they should utilize the mediums of change.&lt;br /&gt;&lt;br /&gt;Ok, so here's my main point that I want to stress:&lt;br /&gt;&lt;blockquote&gt;The goals of DataCleaner are socially constructed by the desires of the community&lt;br /&gt;&lt;/blockquote&gt;What does this mean? This means that I don't set the goals for the project myself. Actually I have been doing so a lot but only because no one else did it. If the community was to say that they wanted the project to go in a direction I would be perfectly happy to help them.&lt;br /&gt;&lt;br /&gt;What sparked me to do this blog entry was actually my friend and co-worker at &lt;a href="http://eobjects.org"&gt;eobjects.org&lt;/a&gt;, Asbjørn Leeth. When I was discussing some more or less technical changes that I was thinking of in regards to DataCleaner he said:&lt;br /&gt;&lt;blockquote&gt;"I think you should carefully consider the overall purpose of DataCleaner and where you want it to go. Who are the users and how should DataCleaner be used in a broader context?"&lt;/blockquote&gt;I absolutely agree on this quote but the thing is that I wouldn't be the one to make those decisions! I would probably have an oppinion but ultimately it's not my decision because &lt;span style="font-weight: bold;"&gt;I'm not the user, I'm just the medium of change&lt;/span&gt;. 
I think that the DataCleaner community should be better to involve themselves in these crucial themes, but I don't blame them because this is perhaps not a role that we are yet familiar with as Open Source actors.&lt;br /&gt;&lt;br /&gt;We need to recognize why the developers do Open Source software and when they are rewarded. This varies a lot from community to community but in the case of DataCleaner I personally get a kick out of it every time someone says to me that they are using the product and that they think it rocks! This is my greatest reward. It's a lot greater than the times that people pay me money to help them with their problems (that may or may not be DataCleaner related). What does it mean that &lt;span style="font-weight: bold;"&gt;the greatest reward is the recognition of others&lt;/span&gt;? It means that you could effectively steer the development of DataCleaner simply by putting out your own goals and ideas for people like me to realize! Easy! This requires involvement and gratitude from the community but given that, the community will be able to use the mediums of change in a far more effective way.&lt;br /&gt;&lt;br /&gt;One example of how this works is when &lt;a href="http://eobjects.org/trac/wiki/WhoUsesDataCleaner"&gt;Ben Bor&lt;/a&gt; some months ago sent me a list of some-20 changes and fixes that he wanted in DataCleaner. Within a few days I had effectively fixed around half of them and the rest were submitted to the roadmap so we'll work ourselves through them as we go. My point here is that while I may be the one with the &lt;span style="font-weight: bold;"&gt;easiest access to changes&lt;/span&gt; (because I know the product and code so well) I may not be the one who knows what the user wants. That being said, put very shortly here are a couple of things that I have been thinking of in regards to high-level changes in the strategy of DataCleaner:&lt;br /&gt;&lt;ul&gt;&lt;li&gt;Change the name of the application! 
We don't provide data cleansing. Rather we mostly do profiling and our validation engine is also rather OK, so perhaps we should think of more fitting names.&lt;/li&gt;&lt;li&gt;Remove the separation of profiling and validation in the User Interface. The User Interface should rather reflect the process and provide convenient tools to the user instead of represent the internal entities of the application.&lt;/li&gt;&lt;li&gt;These changes would definitely imply a change to version 2.0 of DataCleaner because it would mean fundamental changes both to the User Interface and the core module.&lt;br /&gt;&lt;/li&gt;&lt;/ul&gt;Those are just my 25 cents in the goal-setting debate. I think for Open Source to really prosper we need user-based communities who understand that they are not just "takers of software" they are also "givers of opinions".&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-4205535637114893842?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/4205535637114893842/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=4205535637114893842' title='2 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/4205535637114893842'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/4205535637114893842'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2008/11/setting-goals-for-volunteer-open-source.html' title='Setting goals for volunteer Open Source projects and DataCleaner in particular'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image 
rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>2</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-4417874875089917393</id><published>2008-11-17T10:35:00.008+01:00</published><updated>2009-06-17T18:47:01.835+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='sql'/><category scheme='http://www.blogger.com/atom/ns#' term='open source'/><category scheme='http://www.blogger.com/atom/ns#' term='java'/><category scheme='http://www.blogger.com/atom/ns#' term='query'/><category scheme='http://www.blogger.com/atom/ns#' term='spreadsheet'/><category scheme='http://www.blogger.com/atom/ns#' term='read'/><category scheme='http://www.blogger.com/atom/ns#' term='file'/><category scheme='http://www.blogger.com/atom/ns#' term='metamodel'/><category scheme='http://www.blogger.com/atom/ns#' term='excel'/><title type='text'>Query your Excel spreadsheet with Java</title><content type='html'>&lt;p&gt;The &lt;a href="http://eobjects.org/metamodel"&gt;MetaModel project&lt;/a&gt; lets you do wonderful and advanced things like filtering, ordering, grouping etc. when working with otherwise static content in CSV files, Excel spreadsheets, XML files etc. Many times I have knocked my head to the door when trying to get simple summary data out of my excel spreadsheets or simply filtering on single rows in the sheets.&lt;/p&gt;&lt;p&gt;&lt;b&gt;An example:&lt;/b&gt; Recently I was working on my tax return statement and throughout the year I had recorded all relevant activities in a simple spreadsheet. I had marked the activities on various accounts such as representation, spendings, earnings whatever. 
Simplified it looked something like this:&lt;/p&gt;&lt;table&gt;&lt;tr&gt;&lt;th&gt;Title&lt;/th&gt;&lt;th&gt;Amount&lt;/th&gt;&lt;th&gt;Account&lt;/th&gt;&lt;/tr&gt;&lt;tr&gt;&lt;td&gt;Asus EEE&lt;/td&gt;&lt;td&gt;1.600&lt;/td&gt;&lt;td&gt;Spendings&lt;/td&gt;&lt;/tr&gt;&lt;tr&gt;&lt;td&gt;Advisory for Company X&lt;/td&gt;&lt;td&gt;4.000&lt;/td&gt;&lt;td&gt;Earnings&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;&lt;p&gt;One thing I think is funny when dealing with Excel is that throughout time you've learned how to set up your spreadsheet for easy hacking instead of dynamic querying. What I mean by this is that if I wanted to get SUM for each account in my spreadsheet I would probably have fundamentally changed my (otherwise nice and pure) spreadsheet in order to make it easy to perform the SUM function on singular accounts. For example I may have introduced an amount column for each account. But alas, adding a new Account would be pretty cumbersome.&lt;/p&gt;&lt;p&gt;&lt;b&gt;Enter MetaModel!&lt;/b&gt; With &lt;a href="http://eobjects.org/metamodel"&gt;MetaModel&lt;/a&gt; I can write queries in a SQL-like manner. When I say "SQL-like" you may fret and think "oh but don't teach me another SQL dialect, just give me the real deal!". The answer to this is that what I'll teach you now will effectively replace all SQL dialects because the same model (or MetaModel ;)) is usable for all kinds of datastores: Excel, CSV/TSV files, XML and SQL-databases. Let's have a look at some code. 
First we'll load the Excel file into MetaModel and get a hold of our schema and table structure and identify our columns of interest:&lt;/p&gt;&lt;blockquote&gt;&lt;i style="color: green;"&gt;// Use the factory to create a DataContext and automatically (true) narrow column types&lt;/i&gt;&lt;br /&gt;&lt;a href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/DataContext.html"&gt;DataContext&lt;/a&gt; dataContext = &lt;a href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/DataContextFactory.html"&gt;DataContextFactory&lt;/a&gt;.createExcelDataContext(new File("my_tax_return_activities.xls"), true);&lt;br /&gt;&lt;a href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/schema/Schema.html"&gt;Schema&lt;/a&gt; schema = dataContext.getDefaultSchema();&lt;br /&gt;&lt;i style="color: green;"&gt;// A simple way of getting the table object is just to pick the first one (the first sheet in the spreadsheet)&lt;/i&gt;&lt;br /&gt;&lt;a href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/schema/Table.html"&gt;Table&lt;/a&gt; table = schema.getTables()[0];&lt;br /&gt;&lt;a href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/schema/Column.html"&gt;Column&lt;/a&gt; amountColumn = table.getColumnByName("Amount");&lt;br /&gt;&lt;a href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/schema/Column.html"&gt;Column&lt;/a&gt; accountColumn = table.getColumnByName("Account");&lt;/blockquote&gt;&lt;p&gt;Now that we have the columns represented and the &lt;a href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/DataContext.html"&gt;DataContext&lt;/a&gt; which we can use to perform queries, here's how we make a couple of interesting queries using the &lt;a href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/query/package-summary.html"&gt;MetaModel query API&lt;/a&gt;:&lt;/p&gt;&lt;blockquote&gt;&lt;i style="color: green;"&gt;// SELECT SUM(Amount) FROM sheet 
WHERE Account = "Spendings"&lt;/i&gt;&lt;br /&gt;&lt;a href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/query/Query.html"&gt;Query&lt;/a&gt; q = new &lt;a href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/query/Query.html"&gt;Query&lt;/a&gt;().select(new SelectItem(&lt;a href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/query/FunctionType.html"&gt;FunctionType&lt;/a&gt;.SUM, amountColumn)).from(table).where(accountColumn, &lt;a href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/query/OperatorType.html"&gt;OperatorType&lt;/a&gt;.EQUALS_TO, "Spendings");&lt;br /&gt;&lt;br /&gt;&lt;i style="color: green;"&gt;// SELECT Account, SUM(Amount) FROM sheet GROUP BY Account&lt;/i&gt;&lt;br /&gt;&lt;a href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/query/Query.html"&gt;Query&lt;/a&gt; q = new &lt;a href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/query/Query.html"&gt;Query&lt;/a&gt;().select(accountColumn).select(new SelectItem(&lt;a href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/query/FunctionType.html"&gt;FunctionType&lt;/a&gt;.SUM, amountColumn)).from(table).groupBy(accountColumn);&lt;br /&gt;&lt;/blockquote&gt;&lt;p&gt;Now when we've authored the queries we can let MetaModel execute them and deal with the result as appropriate. 
If we just want to print out the results to the console we'll do something like this:&lt;/p&gt;&lt;blockquote&gt;&lt;a href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/data/DataSet.html"&gt;DataSet&lt;/a&gt; dataSet = dataContext.executeQuery(q);&lt;br /&gt;while (dataSet.next()) {&lt;br /&gt; &amp;nbsp; &amp;nbsp; System.out.println(dataSet.getRow());&lt;br /&gt;}&lt;br /&gt;dataSet.close();&lt;/blockquote&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-4417874875089917393?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/4417874875089917393/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=4417874875089917393' title='24 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/4417874875089917393'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/4417874875089917393'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2008/11/query-your-excel-spreadsheet-with-java.html' title='Query your Excel spreadsheet with Java'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>24</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-8553734485287250447</id><published>2008-11-11T06:35:00.032+01:00</published><updated>2008-11-14T16:58:37.755+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' 
term='sql'/><category scheme='http://www.blogger.com/atom/ns#' term='open source'/><category scheme='http://www.blogger.com/atom/ns#' term='csv'/><category scheme='http://www.blogger.com/atom/ns#' term='java'/><category scheme='http://www.blogger.com/atom/ns#' term='datastore'/><category scheme='http://www.blogger.com/atom/ns#' term='query'/><category scheme='http://www.blogger.com/atom/ns#' term='file'/><category scheme='http://www.blogger.com/atom/ns#' term='metamodel'/><title type='text'>Querying a CSV file!</title><content type='html'>&lt;p&gt;Today I'm going to demonstrate some of the functionality of &lt;a href="http://eobjects.org/metamodel"&gt;MetaModel&lt;/a&gt; 1.1, which provides a &lt;b&gt;datastore-transparent querying API&lt;/b&gt; for Java. Actually what I'll show you isn't all that new, we've been able to do most of the querying stuff for CSV files (and other datastores) roughly since MetaModel 1.0, but it seems to me that too few realize what a powerful tool we have with MetaModel, so I'm just going to repeat myself a bit here ;) Also, I'll demonstrate one of the new cool features of MetaModel 1.1 - column type &lt;b&gt;detection&lt;/b&gt;, &lt;b&gt;narrowing&lt;/b&gt; and value &lt;b&gt;transformation&lt;/b&gt;.&lt;/p&gt;&lt;p&gt;For this example I'll use some real data: I've extracted &lt;a href="http://eobjects.org/svn/MetaModel/trunk/MetaModel-csv/src/test/resources/tickets.csv"&gt;this CSV file&lt;/a&gt; from the eobjects.org trac system. It contains a list of all the tickets (issues, bugs, tasks, whatever) that are active at the time of writing... You'll notice if you take a look at the file, that it's not exactly a simple CSV file - a lot of text spans multiple lines and there are quite a lot of different data types.&lt;/p&gt;&lt;p&gt;Ok, so let's get started. I've saved the data to a file: &lt;b&gt;tickets.csv&lt;/b&gt;. Now I want to read the file and let MetaModel generate a metadata model based on it. 
I will also let MetaModel try and detect the column types (since CSV only contains text-types natively) which will automatically transform all values to the most fitting and narrow type that MetaModel can find (this is indicated by the 'true' parameter in the code  below). Here's how we get a hold of the datastore model for the file:&lt;/p&gt;&lt;blockquote&gt;File file = new File("tickets.csv");&lt;br /&gt;&lt;a href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/DataContext.html"&gt;DataContext&lt;/a&gt; dataContext = &lt;a href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/DataContextFactory.html"&gt;DataContextFactory&lt;/a&gt;.createCsvDataContext(file, true);&lt;br /&gt;&lt;/blockquote&gt;&lt;p&gt;Once we have a &lt;a href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/DataContext.html"&gt;DataContext&lt;/a&gt; object we are ready to go for our &lt;b&gt;datastore-transparent&lt;/b&gt; way of querying. What we do is: We get a hold of the schema of our CSV file and we locate the table of interest. Since CSV is a single-table datastore type, getting the table of interest can be done in two ways:&lt;/p&gt;&lt;blockquote&gt;&lt;a href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/schema/Table.html"&gt;Table&lt;/a&gt; table;&lt;br /&gt;&lt;br /&gt;&lt;i style="color: green;"&gt;//get table by name&lt;/i&gt;&lt;br /&gt;table = schema.getTableByName("tickets");&lt;br /&gt;&lt;br /&gt;&lt;i style="color: green;"&gt;//get table by index&lt;/i&gt;&lt;br /&gt;table = schema.getTables()[0];&lt;br /&gt;&lt;/blockquote&gt;&lt;p&gt;Now we can go ahead and investigate the structure of the CSV file. Since we turned on automatic column type narrowing you will see that the 'ticket' column have been converted from a text-based column to an INTEGER type. 
Also, as MetaModel can verify that 'ticket'-values are never missing, it is asserted that the column is not nullable:&lt;/p&gt;&lt;blockquote&gt;&lt;a href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/schema/Column.html"&gt;Column&lt;/a&gt; ticketColumn = table.getColumnByName("ticket");&lt;br /&gt;&lt;br /&gt;&lt;i style="color: green;"&gt;//Will print: "Ticket column type: INTEGER"&lt;/i&gt;&lt;br /&gt;System.out.println("Ticket column type: " + ticketColumn.getType());&lt;br /&gt;&lt;br /&gt;&lt;i style="color: green;"&gt;//Will print: "Ticket column nullable: false"&lt;/i&gt;&lt;br /&gt;System.out.println("Ticket column nullable: " + ticketColumn.isNullable());&lt;br /&gt;&lt;/blockquote&gt;&lt;p&gt;And now for the fun and impressing part... Let's try to make some queries! Here are a couple of examples:&lt;/p&gt;&lt;blockquote&gt;&lt;a href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/query/Query.html"&gt;Query&lt;/a&gt; q;&lt;br /&gt;&lt;br /&gt;&lt;i style="color: green;"&gt;//SELECT * FROM tickets ORDER BY ticket&lt;/i&gt;&lt;br /&gt;q = new &lt;a href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/query/Query.html"&gt;Query&lt;/a&gt;().select(table.getColumns()).from(table).orderBy(ticketColumn);&lt;br /&gt;&lt;br /&gt;&lt;i style="color: green;"&gt;//SELECT SUM(ticket) FROM tickets&lt;/i&gt;&lt;br /&gt;q = new &lt;a href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/query/Query.html"&gt;Query&lt;/a&gt;().select(&lt;a href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/query/FunctionType.html"&gt;FunctionType&lt;/a&gt;.SUM, ticketColumn).from(table);&lt;br /&gt;&lt;br /&gt;&lt;i style="color: green;"&gt;//SELECT _reporter AS rep_name, COUNT(*) AS num_tickets FROM tickets GROUP BY rep_name&lt;/i&gt;&lt;br /&gt;Column reporterColumn = table.getColumnByName("_reporter");&lt;br /&gt;q = new &lt;a 
href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/query/Query.html"&gt;Query&lt;/a&gt;().select(reporterColumn,"rep_name").selectCount().from(table).groupBy(reporterColumn);&lt;br /&gt;&lt;/blockquote&gt;&lt;p&gt;To execute the queries is very simple - just ask your DataContext object to execute the query object. MetaModel will then analyze the query and process the result behind the scenes:&lt;/p&gt;&lt;blockquote&gt;&lt;a href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/data/DataSet.html"&gt;DataSet&lt;/a&gt; dataSet = dataContext.executeQuery(q);&lt;br /&gt;&lt;/blockquote&gt;&lt;p&gt;We can process MetaModel DataSets in quite a few ways. A typical way would be to &lt;b&gt;iterate&lt;/b&gt; through it, similar to a &lt;b&gt;ResultSet&lt;/b&gt;:&lt;/p&gt;&lt;blockquote&gt;while (dataSet.next()) {&lt;br /&gt;    &lt;a href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/data/Row.html"&gt;Row&lt;/a&gt; row = dataSet.getRow();&lt;br /&gt;    &lt;i style="color: green;"&gt;//Extract values or do something similar with the row&lt;/i&gt;&lt;br /&gt;    System.out.println("row: " + row);&lt;br /&gt;}&lt;br /&gt;&lt;/blockquote&gt;&lt;p&gt;... Or we can &lt;b&gt;transform&lt;/b&gt; the DataSet into a &lt;b&gt;TableModel&lt;/b&gt; object:&lt;/p&gt;&lt;blockquote&gt;&lt;a href="http://java.sun.com/j2se/1.5.0/docs/api/javax/swing/table/TableModel.html"&gt;TableModel&lt;/a&gt; tableModel = dataSet.toTableModel();&lt;br /&gt;&lt;/blockquote&gt;&lt;p&gt;... 
Or we can &lt;b&gt;transform&lt;/b&gt; it into a list of Object arrays:&lt;/p&gt;&lt;blockquote&gt;List&amp;lt;Object[]&amp;gt; objectArrays = dataSet.toObjectArrays();&lt;/blockquote&gt;&lt;p&gt;As you can see, with the &lt;a href="http://eobjects.org/metamodel/apidocs/1.1/"&gt;MetaModel API&lt;/a&gt; a lot of things that used to be difficult is now really, really easy!&lt;/p&gt;&lt;p&gt;&lt;a href="#fullExample1301" onclick="javascript:document.getElementById('fullExample1301').style.display='block';"&gt;Click here&lt;/a&gt; to display the full example.&lt;/p&gt;&lt;blockquote id="fullExample1301" style="display: none;"&gt;File file = new File("tickets.csv");&lt;br /&gt;&lt;a href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/DataContext.html"&gt;DataContext&lt;/a&gt; dataContext = &lt;a href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/DataContextFactory.html"&gt;DataContextFactory&lt;/a&gt;.createCsvDataContext(file, true);&lt;br /&gt;&lt;br /&gt;&lt;a href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/schema/Table.html"&gt;Table&lt;/a&gt; table;&lt;br /&gt;&lt;i style="color: green;"&gt;//get table by name&lt;/i&gt;&lt;br /&gt;table = schema.getTableByName("tickets");&lt;br /&gt;&lt;i style="color: green;"&gt;//get table by index&lt;/i&gt;&lt;br /&gt;table = schema.getTables()[0];&lt;br /&gt;&lt;br /&gt;&lt;a href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/schema/Column.html"&gt;Column&lt;/a&gt; ticketColumn = table.getColumnByName("ticket");&lt;br /&gt;&lt;br /&gt;&lt;i style="color: green;"&gt;//Will print: "Ticket column type: INTEGER"&lt;/i&gt;&lt;br /&gt;System.out.println("Ticket column type: " + ticketColumn.getType());&lt;br /&gt;&lt;br /&gt;&lt;i style="color: green;"&gt;//Will print: "Ticket column nullable: false"&lt;/i&gt;&lt;br /&gt;System.out.println("Ticket column nullable: " + ticketColumn.isNullable());&lt;br /&gt;&lt;br /&gt;&lt;a 
href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/query/Query.html"&gt;Query&lt;/a&gt; q;&lt;br /&gt;&lt;a href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/data/DataSet.html"&gt;DataSet&lt;/a&gt; dataSet;&lt;br /&gt;&lt;br /&gt;&lt;i style="color: green;"&gt;//SELECT * FROM tickets ORDER BY ticket&lt;/i&gt;&lt;br /&gt;q = new &lt;a href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/query/Query.html"&gt;Query&lt;/a&gt;().select(table.getColumns()).from(table).orderBy(ticketColumn);&lt;br /&gt;dataSet = dataContext.executeQuery(q);&lt;br /&gt;&lt;br /&gt;while (dataSet.next()) {&lt;br /&gt; &amp;nbsp; &amp;nbsp; &lt;a href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/data/Row.html"&gt;Row&lt;/a&gt; row = dataSet.getRow();&lt;br /&gt; &amp;nbsp; &amp;nbsp; &lt;i style="color: green;"&gt;//Extract values or do something similar with the row&lt;/i&gt;&lt;br /&gt; &amp;nbsp; &amp;nbsp; System.out.println("row: " + row);&lt;br /&gt;}&lt;br /&gt;&lt;br /&gt;&lt;i style="color: green;"&gt;//SELECT SUM(ticket) FROM tickets&lt;/i&gt;&lt;br /&gt;q = new &lt;a href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/query/Query.html"&gt;Query&lt;/a&gt;().select(&lt;a href="http://eobjects.org/metamodel/apidocs/1.1/dk/eobjects/metamodel/query/FunctionType.html"&gt;FunctionType&lt;/a&gt;.SUM, ticketColumn).from(table);&lt;br /&gt;dataSet = dataContext.executeQuery(q);&lt;br /&gt;&lt;br /&gt;&lt;a href="http://java.sun.com/j2se/1.5.0/docs/api/javax/swing/table/TableModel.html"&gt;TableModel&lt;/a&gt; tableModel = dataSet.toTableModel();&lt;br /&gt;&lt;br /&gt;&lt;i style="color: green;"&gt;//SELECT _reporter AS rep_name, COUNT(*) AS num_tickets FROM tickets GROUP BY rep_name&lt;/i&gt;&lt;br /&gt;Column reporterColumn = table.getColumnByName("_reporter");&lt;br /&gt;q = new Query().select(reporterColumn,"rep_name").selectCount().from(table).groupBy(reporterColumn);&lt;br /&gt;dataSet = 
dataContext.executeQuery(q);&lt;br /&gt;&lt;br /&gt;List&amp;lt;Object[]&amp;gt; objectArrays = dataSet.toObjectArrays();&lt;br /&gt;&lt;/blockquote&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-8553734485287250447?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/8553734485287250447/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=8553734485287250447' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/8553734485287250447'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/8553734485287250447'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2008/11/querying-csv-file.html' title='Querying a CSV file!'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-7522684236900793201</id><published>2008-11-05T15:45:00.002+01:00</published><updated>2008-11-05T15:47:57.706+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='open source'/><category scheme='http://www.blogger.com/atom/ns#' term='datastore'/><category scheme='http://www.blogger.com/atom/ns#' term='metamodel'/><category scheme='http://www.blogger.com/atom/ns#' term='release'/><category scheme='http://www.blogger.com/atom/ns#' term='database'/><title type='text'>MetaModel 1.1 
released!</title><content type='html'>I just ditched my masters thesis today to work on good ol' &lt;a href="http://eobjects.org/"&gt;MetaModel&lt;/a&gt;! I spent my time straightening out the last remaining tasks on the 1.1 release which I have been looking forward to for some time now. So I'm now happy to announce that&lt;br /&gt;&lt;br /&gt;&lt;big&gt;MetaModel 1.1 has just been released!&lt;/big&gt;&lt;br /&gt;&lt;br /&gt;Head over to &lt;a href="http://eobjects.org/news"&gt;the eobjects.org news site&lt;/a&gt; to learn more about what this release is all about!&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-7522684236900793201?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/7522684236900793201/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=7522684236900793201' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/7522684236900793201'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/7522684236900793201'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2008/11/metamodel-11-released.html' title='MetaModel 1.1 released!'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' 
src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-2457041210498043403</id><published>2008-10-20T18:40:00.004+02:00</published><updated>2008-10-21T19:44:16.134+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='open source'/><category scheme='http://www.blogger.com/atom/ns#' term='data quality'/><category scheme='http://www.blogger.com/atom/ns#' term='marketing'/><category scheme='http://www.blogger.com/atom/ns#' term='data quality pro'/><category scheme='http://www.blogger.com/atom/ns#' term='master data management'/><category scheme='http://www.blogger.com/atom/ns#' term='business intelligence'/><category scheme='http://www.blogger.com/atom/ns#' term='datacleaner'/><title type='text'>Open Source acknowledged by the Data Quality community?</title><content type='html'>Here in Denmark I often feel that the Data Quality, Master Data Management and Business Intelligence field is pretty fearsome towards Open Source software. I think this largely has to do with lack of presence, a lot of prejudices and few established consultancy firms in this part of the world.&lt;br /&gt;&lt;br /&gt;This is also why it's always a great surprise, and a good one, when you catch the interest of the international Data Quality venue. Last week I was in a correspondence with the guys over at &lt;a href="http://www.dataqualitypro.com/"&gt;Data Quality Pro&lt;/a&gt; who was building a new &lt;a href="http://www.dataqualitypro.com/open-source-data-quality/"&gt;Open Source Data Quality page&lt;/a&gt; and we had a nice chat about the tools available and the opportunities out there. 
I'm very glad that people are showing interest and hopefully the Danish BI scene will also adapt to the wonderful world of Open Source software as we go along...&lt;br /&gt;&lt;br /&gt;Talking about getting the word out, where should you "advertise" your Open Source product to the business world? I personally think it's hard work to market your product even though you're giving it away for free and you would think that people automatically rushed to your website ;) Of course everyone needs to be aware that the software is here and I try a lot to put the word out there on &lt;a href="http://kasper.eobjects.dk/2008/10/fast-as-lightning.html"&gt;conferences&lt;/a&gt;, &lt;a href="http://en.wikipedia.org/wiki/DataCleaner"&gt;wikipedia&lt;/a&gt;, &lt;a href="https://sourceforge.net/projects/datacleaner/"&gt;sourceforge&lt;/a&gt;, &lt;a href="http://freshmeat.net/projects/datacleaner"&gt;freshmeat&lt;/a&gt;, &lt;a href="http://www.ohloh.net/projects/datacleaner"&gt;ohloh&lt;/a&gt; etc.. But let me raise this question to everybody involved with marketing software for no cost: How do you do it?&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-2457041210498043403?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/2457041210498043403/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=2457041210498043403' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/2457041210498043403'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/2457041210498043403'/><link rel='alternate' type='text/html' 
href='http://kasper.eobjects.org/2008/10/open-source-acknowledged-by-data.html' title='Open Source acknowledged by the Data Quality community?'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-2005802330528441934</id><published>2008-10-17T17:08:00.006+02:00</published><updated>2008-10-17T17:37:36.746+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='open source'/><category scheme='http://www.blogger.com/atom/ns#' term='wikipedia'/><category scheme='http://www.blogger.com/atom/ns#' term='master'/><category scheme='http://www.blogger.com/atom/ns#' term='process'/><category scheme='http://www.blogger.com/atom/ns#' term='education'/><category scheme='http://www.blogger.com/atom/ns#' term='academia'/><category scheme='http://www.blogger.com/atom/ns#' term='thesis'/><category scheme='http://www.blogger.com/atom/ns#' term='project'/><title type='text'>Why can't my masters thesis be more like my Open Source project?</title><content type='html'>&lt;p&gt;Being a hard working student I sometimes have to question (and from time to time applaude) the practices of academia and the tools that we use to foster innovation, creativity and knowledge-sharing. My masters thesis subject is concerned with the development methods of open source communities and companies that try to enable community-based development of their products. Yesterday I was considering this quote:&lt;/p&gt;&lt;blockquote&gt;"In the cathedral-builder view of programming, bugs and development problems are tricky, insidious, deep phenomena. It takes months of scrutiny by a dedicated few to develop confidence that you've winkled them all out." 
- &lt;a href="http://www.catb.org/~esr/writings/cathedral-bazaar/cathedral-bazaar/ar01s04.html"&gt;Eric S. Raymond - The Cathedral and The Bazaar&lt;/a&gt;&lt;/blockquote&gt;&lt;p&gt;Looking aside from the fact that it deals with programming and not writing a paper (and trying to grow global awareness and knowledge on a specific topic) I thought to myself, that the cathedral-builder process is pretty similar to the process of writing a masters thesis. There are pretty strict guidelines to follow, a lot of scrutiny involved and planning by a dedicated few - in my case myself and my supervisor.&lt;/p&gt;&lt;p&gt;So is there room for another way, a more open process with distributed peers, continuous redesign, short release-spans etc.? Obviously there are things like &lt;a href="http://www.wikipedia.org"&gt;wikipedia&lt;/a&gt; that provide this for topics of interest to the general public, but needless to say science projects often go beyond that level of information and have to deal with experiments, not just facts of life such as those in an encyclopedia.&lt;/p&gt;&lt;p&gt;Also there's the issue of academia culture. Ego and elitism doubtlessly play a big part in maintaining a high degree of secrecy and closeness of scientific endeavor. I'd love to see scientists work in a community-enabling fashion and then I'd love to contribute to one of those (or create my own for my masters thesis). 
Let's for example try something like this out and we'll be well under way:&lt;/p&gt;&lt;ul&gt;&lt;li&gt;"Bugtrackers" for all the items that need investigation&lt;/li&gt;&lt;li&gt;"Source control management" and versioning systems for revisions of the paper(s)&lt;/li&gt;&lt;li&gt;Chatty mailing lists for peer review and discussions&lt;/li&gt;&lt;li&gt;"Continuous integration" for managing/matching references and terms within the paper&lt;/li&gt;&lt;li&gt;Free availability to all underlying data, not just the published parts&lt;/li&gt;&lt;/ul&gt;&lt;p&gt;The beauties of this would be similar to the beauties of open source. And particularly in academia there's a strong need to be able to track down who's done what and source control management and reference-management would greatly improve on that account. For evaluators it would be possible to see the actual changes made by each student in group work, for group-working students it would be possible to track the actual changes to the project (as opposed to having to read it all over again every time you exchange documents)... 
Perhaps a more "open" science would be just what we need?&lt;/p&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-2005802330528441934?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/2005802330528441934/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=2005802330528441934' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/2005802330528441934'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/2005802330528441934'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2008/10/why-cant-my-masters-thesis-be-more-like.html' title='Why can&apos;t my masters thesis be more like my Open Source project?'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-6820597862836482551</id><published>2008-10-13T20:39:00.003+02:00</published><updated>2008-10-13T20:48:02.843+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='release'/><category scheme='http://www.blogger.com/atom/ns#' term='lgpl'/><category scheme='http://www.blogger.com/atom/ns#' term='openoffice'/><category scheme='http://www.blogger.com/atom/ns#' term='datacleaner'/><title type='text'>A day of releases!</title><content type='html'>&lt;div&gt;I saw this morning that the new &lt;a 
href="http://openoffice.org/"&gt;OpenOffice 3&lt;/a&gt; is out! Congratulations to the OO.o crew, I'm enjoying using your product for my upcoming masters thesis :)&lt;/div&gt;&lt;br /&gt;&lt;div&gt;Today is also the day that DataCleaner 1.5 "snapshot" has been released. Here's the press release:&lt;/div&gt;&lt;br /&gt;&lt;blockquote&gt;&lt;div&gt;As we're moving steadily along towards the release of DataCleaner 1.5 we are fixing a few bugs and enhancing a lot of features. This leads to the desire to release our work since practically nothing has undergone changes that could destabilize the application since the 1.4 release. So today we're releasing DataCleaner 1.5 "snapshot". This also marks the first release under our new LGPL license.&lt;/div&gt;&lt;div&gt;&lt;br /&gt;&lt;/div&gt;&lt;div&gt;Here are the changes from 1.4 so far:&lt;/div&gt;&lt;ul&gt;&lt;li&gt;Change of license to LGPL.&lt;br /&gt;&lt;/li&gt;&lt;li&gt;New profile: Date mask matcher.&lt;br /&gt;&lt;/li&gt;&lt;li&gt;New profile: Regex matcher.&lt;br /&gt;&lt;/li&gt;&lt;li&gt;More file types supported (.dat, .txt)&lt;br /&gt;&lt;/li&gt;&lt;li&gt;XML file support improved (.xml)&lt;br /&gt;&lt;/li&gt;&lt;/ul&gt;&lt;div&gt;Although this is in principle a development/beta release, we feel that it would be worth working with for most of your profiling needs. So... 
Go on, &lt;a href="http://eobjects.org/trac/wiki/GetDataCleaner"&gt;download it&lt;/a&gt;, &lt;a href="http://eobjects.org/trac/discussion/1"&gt;tell us what you think&lt;/a&gt; and we'll see you around!&lt;/div&gt;&lt;br /&gt;&lt;/blockquote&gt;&lt;br /&gt;I hope you all enjoy the new version of DataCleaner!&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-6820597862836482551?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/6820597862836482551/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=6820597862836482551' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/6820597862836482551'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/6820597862836482551'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2008/10/day-of-releases.html' title='A day of releases!'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-8338680681287948338</id><published>2008-10-04T15:20:00.003+02:00</published><updated>2008-10-04T18:40:38.114+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='days'/><category scheme='http://www.blogger.com/atom/ns#' term='open'/><category scheme='http://www.blogger.com/atom/ns#' term='metamodel'/><category 
scheme='http://www.blogger.com/atom/ns#' term='08'/><category scheme='http://www.blogger.com/atom/ns#' term='source'/><category scheme='http://www.blogger.com/atom/ns#' term='presentation'/><category scheme='http://www.blogger.com/atom/ns#' term='datacleaner'/><title type='text'>Fast as lightning!</title><content type='html'>Whoa! I just got through my Lightning Speak about &lt;a href="http://eobjects.org/"&gt;eobjects.org&lt;/a&gt; and &lt;a href="http://eobjects.org/datacleaner"&gt;DataCleaner&lt;/a&gt; at the &lt;a href="http://opensourcedays.org/"&gt;Open Source Days '08&lt;/a&gt; conference about an hour ago. It was a great experience - very fun and kinda stressing (in the good, "get to the point"-kinda way) to have an alarm clock counting down for your 15 minutes of fame!&lt;br /&gt;&lt;br /&gt;And indeed my presentation was very much to the point. I wanted to tell people about the great creative projects at &lt;a href="http://eobjects.org"&gt;eobjects.org&lt;/a&gt; and especially about &lt;a href="http://eobjects.org/datacleaner"&gt;DataCleaner&lt;/a&gt; and the &lt;a href="http://eobjects.org/metamodel"&gt;MetaModel&lt;/a&gt; project, which I dubbed a "derivative" project. 
My speak also quickly sketched the domain of data quality and people were nodding when I concluded that they all should download DataCleaner and give their datasources a quick profile the next time they worked on their projects.&lt;br /&gt;&lt;br /&gt;You can download my slides here: &lt;a href="http://eobjects.org/resources/download/opensourcedays.pdf"&gt;http://eobjects.org/resources/download/opensourcedays.pdf&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;Unfortunately the format of the Lightning Speak didn't allow for much time for comments and questions from the audience, but I hope and think that they had a good time!&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-8338680681287948338?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/8338680681287948338/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=8338680681287948338' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/8338680681287948338'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/8338680681287948338'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2008/10/fast-as-lightning.html' title='Fast as lightning!'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' 
src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-7398746268627741053</id><published>2008-09-08T15:31:00.005+02:00</published><updated>2008-09-08T15:42:47.012+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='days'/><category scheme='http://www.blogger.com/atom/ns#' term='conference'/><category scheme='http://www.blogger.com/atom/ns#' term='open'/><category scheme='http://www.blogger.com/atom/ns#' term='lightning'/><category scheme='http://www.blogger.com/atom/ns#' term='source'/><category scheme='http://www.blogger.com/atom/ns#' term='speak'/><category scheme='http://www.blogger.com/atom/ns#' term='datacleaner'/><category scheme='http://www.blogger.com/atom/ns#' term='2008'/><title type='text'>Presenting DataCleaner at Open Source Days '08</title><content type='html'>The &lt;a href="http://www.opensourcedays.org/"&gt;Open Source Days&lt;/a&gt; conference in Copenhagen has been growing steadily for the last couple of years. Some of you may know it by its former name, LinuxForum, though. This year the conference is featuring a new concept: &lt;a href="http://www.opensourcedays.org/2008/agenda/lightning.shtml"&gt;Lightning speaks&lt;/a&gt;. Lightning speaks are less comprehensive speaks by less profiled speakers that are not necessarily based upon a paper. This doesn't mean that the speaks will be less relevant, on the contrary I think we will see a lot of hyper-interesting "up and coming" speakers delivering early keynotes of major topics to come.&lt;br /&gt;&lt;br /&gt;As you may have guessed by my enthusiasm about the Lightning Speak concept, I am speaking at this year's Open Source Days conference on Sunday the 4th of October at 14:00! 
The topic will be DataCleaner and how the &lt;a href="http://eobjects.dk/"&gt;eobjects.dk&lt;/a&gt; (and now &lt;a href="http://eobjects.org/"&gt;eobjects.org&lt;/a&gt;) community have delivered one of the leading open source Data Quality solutions within less than a year! Here's the official speak statement:&lt;br /&gt;&lt;blockquote&gt;DataCleaner is the most advanced open source data quality solution available. You can use DataCleaner to profile, validate and compare your data in an intuitive graphical environment. The application is compliant with lots of datasources such as JDBC databases, CSV files, Excel spreadsheets, OpenOffice.org database-files and xml files. Among the interesting features are identification of string patterns, value distributions, dictionary lookups, javascripted validation rules, regular expression validation along with traditional metrics used for data profiling and analysis. Come to this lightning speak to listen to Kasper Sørensen, the founder of eobjects.dk and main developer of DataCleaner who will demonstrate DataCleaner and give a quick overview of what's going on with open source data quality. &lt;/blockquote&gt;I hope that if you're joining the conference you'll come and hear me out. 
And if you're not planning to come - then please reconsider ;) I'm very much looking forward to showing off our great product and I think it will be a lovely experience to get some more focus on data quality as well.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-7398746268627741053?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/7398746268627741053/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=7398746268627741053' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/7398746268627741053'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/7398746268627741053'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2008/09/presenting-datacleaner-at-open-source.html' title='Presenting DataCleaner at Open Source Days &apos;08'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-2270154759681320050</id><published>2008-08-26T16:23:00.005+02:00</published><updated>2008-08-27T09:36:28.653+02:00</updated><title type='text'>We're moving eobjects.dk to a new server</title><content type='html'>Hello everybody. 
This is a practical announcement ...&lt;br /&gt;&lt;br /&gt;&lt;span style="font-weight: bold;font-size:100%;" &gt;We're in the process of moving eobjects.dk to a new server.&lt;/span&gt;&lt;br /&gt;&lt;br /&gt;Please return shortly to access the new and improved eobjects.dk!&lt;br /&gt;&lt;br /&gt;&lt;span style="font-weight: bold;font-size:100%;" &gt;Update&lt;/span&gt;&lt;br /&gt;&lt;br /&gt;We're finally finished, but the IP address change will need some hours to cascade worldwide. If you're still being redirected to this page it's because the DNS changes haven't set in yet. The new &lt;a href="http://www.eobjects.dk/"&gt;eobjects.dk&lt;/a&gt; website is online! Impatient people who are suffering from the slow DNS cascades can also access the &lt;a href="http://130.226.47.207/"&gt;website directly by its IP address&lt;/a&gt; (minor glitches are to be expected this way though).&lt;br /&gt;&lt;span style="font-size:100%;"&gt;&lt;br /&gt;&lt;/span&gt;&lt;span style="font-weight: bold;font-size:100%;" &gt;Update&lt;/span&gt;&lt;br /&gt;&lt;br /&gt;By now, all DNS changes should be complete so go ahead&lt;br /&gt;&lt;span style="font-size:180%;"&gt;&lt;a style="font-weight: bold;" href="http://www.eobjects.dk"&gt;enjoy the new eobjects.dk website&lt;/a&gt;&lt;/span&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-2270154759681320050?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/2270154759681320050/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=2270154759681320050' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/2270154759681320050'/><link rel='self' type='application/atom+xml' 
href='http://www.blogger.com/feeds/1301643147176033927/posts/default/2270154759681320050'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2008/08/were-moving-eobjectsdk-to-new-server.html' title='We&apos;re moving eobjects.dk to a new server'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-7615968075401386703</id><published>2008-08-25T21:15:00.003+02:00</published><updated>2008-08-25T21:23:51.211+02:00</updated><title type='text'>Development/snapshot release of DataCleaner 1.4</title><content type='html'>We've released a development/snapshot release of &lt;a href="http://www.eobjects.dk/datacleaner"&gt;DataCleaner&lt;/a&gt; 1.4 in order to get early reactions for all the improvements and new features as well as supporting our users with up to date functionality. 
In my own opinion the development release is just as stable and "safe to use" as 1.3, but of course it lacks a bit of the manual testing that we put into the real releases.&lt;br /&gt;&lt;br /&gt;You can download the development release at our &lt;a href="http://sourceforge.net/project/showfiles.php?group_id=217469"&gt;sourceforge download site&lt;/a&gt;.&lt;br /&gt;&lt;br /&gt;Here's a short list of fixes since DataCleaner 1.3:&lt;br /&gt;&lt;ul&gt;&lt;li&gt;Better memory handling and garbage collection&lt;/li&gt;&lt;li&gt;Reference columns in drill-to-details windows&lt;/li&gt;&lt;li&gt;Better error handling when loading schemas&lt;/li&gt;&lt;li&gt;Quoting of string values in visualized tables (in order to distinguish empty strings and white spaces)&lt;/li&gt;&lt;li&gt;New profile: Value Distribution, which is an improved version of the Repeated Values profile. The Value Distribution profile has an option to configure the top/bottom n values to include in the result.&lt;/li&gt;&lt;li&gt;Better control of profile result column width.&lt;br /&gt;&lt;/li&gt;&lt;li&gt;Bugfix: Copy to clipboard functions now work properly.&lt;/li&gt;&lt;li&gt;Bugfix: Scrollbars added to visualized tables.&lt;/li&gt;&lt;/ul&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-7615968075401386703?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/7615968075401386703/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=7615968075401386703' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/7615968075401386703'/><link rel='self' type='application/atom+xml' 
href='http://www.blogger.com/feeds/1301643147176033927/posts/default/7615968075401386703'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2008/08/developmentsnapshot-release-of.html' title='Development/snapshot release of DataCleaner 1.4'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-2859183485796917055</id><published>2008-08-19T18:03:00.004+02:00</published><updated>2008-08-19T18:14:08.528+02:00</updated><title type='text'>New eobjects.dk website</title><content type='html'>Hi everybody,&lt;br /&gt;&lt;br /&gt;I'm anticipating the release of a new &lt;a href="http://www.eobjects.dk/trac"&gt;eobjects.dk&lt;/a&gt; website design. The website will be launched pretty soon I hope - we're doing it as a part of a general server move. The move is to the laboratory at Copenhagen Business School (&lt;a href="http://www.cbs.dk/"&gt;CBS&lt;/a&gt;) called Business of Open Source Software and Standards (&lt;a href="http://bosss.inf.cbs.dk/"&gt;BOSSS&lt;/a&gt;). 
With BOSSS we'll have a much better bandwidth and a better performing server as well as better physical security conditions.&lt;br /&gt;&lt;br /&gt;The new website will be based on &lt;a href="http://trac.edgewall.org/"&gt;trac&lt;/a&gt; 0.11 (currently we use 0.10) and will feature a lot of improvements for visitors, users and contributors:&lt;br /&gt;&lt;ul&gt;&lt;li&gt;Better theming engine has enabled us to use a more flexible website design with wiki pages appearing as menu items.&lt;/li&gt;&lt;li&gt;A whole new news page which will be used to perform announcements on the progress of our projects.&lt;/li&gt;&lt;li&gt;A lot of other &lt;a href="http://trac.edgewall.org/wiki/TracDev/ReleaseNotes/0.11"&gt;improvements&lt;/a&gt; caused by the trac upgrade.&lt;br /&gt;&lt;/li&gt;&lt;/ul&gt;Here's a little screenshot of the new webpage design (work in progress):&lt;br /&gt;&lt;br /&gt;&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://2.bp.blogspot.com/_UpvxZrigQfQ/SKrvCTd8VqI/AAAAAAAAABk/bQdsHWDmgbY/s1600-h/Screenshot.jpg"&gt;&lt;img style="margin: 0px auto 10px; display: block; text-align: center; cursor: pointer;" src="http://2.bp.blogspot.com/_UpvxZrigQfQ/SKrvCTd8VqI/AAAAAAAAABk/bQdsHWDmgbY/s400/Screenshot.jpg" alt="" id="BLOGGER_PHOTO_ID_5236260339636262562" border="0" /&gt;&lt;/a&gt;&lt;br /&gt;Any comments are welcome!&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-2859183485796917055?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/2859183485796917055/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=2859183485796917055' title='0 Comments'/><link rel='edit' type='application/atom+xml' 
href='http://www.blogger.com/feeds/1301643147176033927/posts/default/2859183485796917055'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/2859183485796917055'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2008/08/new-eobjectsdk-website.html' title='New eobjects.dk website'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://2.bp.blogspot.com/_UpvxZrigQfQ/SKrvCTd8VqI/AAAAAAAAABk/bQdsHWDmgbY/s72-c/Screenshot.jpg' height='72' width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-3317800987764894384</id><published>2008-08-03T23:45:00.004+02:00</published><updated>2008-08-04T00:07:40.821+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='type-casting'/><category scheme='http://www.blogger.com/atom/ns#' term='query'/><category scheme='http://www.blogger.com/atom/ns#' term='rewriting'/><category scheme='http://www.blogger.com/atom/ns#' term='metamodel'/><category scheme='http://www.blogger.com/atom/ns#' term='feature'/><category scheme='http://www.blogger.com/atom/ns#' term='select'/><category scheme='http://www.blogger.com/atom/ns#' term='extraction'/><category scheme='http://www.blogger.com/atom/ns#' term='functions'/><title type='text'>Considering MetaModel functions</title><content type='html'>This blog entry could just as well have been a feature request but I'm going to kick-start it with a couple of thoughts I have for one of the crucial improvements to &lt;a href="http://www.eobjects.dk/metamodel"&gt;MetaModel&lt;/a&gt; that I've been dreaming about.&lt;br /&gt;&lt;br 
/&gt;The last couple of weeks have brought considerable interest in MetaModel, largely thanks to articles posted on &lt;a href="http://www.theserverside.com/news/thread.tss?thread_id=50073"&gt;the server side&lt;/a&gt; and &lt;a href="http://www.infoq.com/news/2008/07/introducing-metamodel"&gt;infoq&lt;/a&gt;. It's been great to get the message out and it's also sparked a lot of great ideas from users/evaluators on the discussion forum. A couple of them have been requests that we build more advanced SELECT items into the query model. In this post I'm going to discuss type-casting and extraction functions and how they can be made possible using the new &lt;a href="http://www.eobjects.dk/hudson/job/MetaModel/ws/trunk/target/site/apidocs/dk/eobjects/metamodel/dialects/IQueryRewriter.html"&gt;IQueryRewriter interface&lt;/a&gt;.&lt;br /&gt;&lt;br /&gt;The idea about query rewriting had been going on for some time, of course inspired by Hibernate's dialects. The thing was though, that for a start I wanted to skip dialect handling completely in order to get to know how far one could actually go without having to do any "hacking" in SQL. It worked out quite well but now that we need to incorporate more advanced, non-standardised features, we will of course need to be able to manipulate with the standard output. This is what the query rewriter is for, and in particular the &lt;a href="http://www.eobjects.dk/hudson/job/MetaModel/ws/trunk/target/site/apidocs/dk/eobjects/metamodel/dialects/AbstractQueryRewriter.html"&gt;AbstractQueryRewriter&lt;/a&gt; helps you do. I've made my &lt;a href="http://www.eobjects.dk/hudson/job/MetaModel/ws/trunk/target/site/apidocs/dk/eobjects/metamodel/dialects/MysqlQueryRewriter.html"&gt;first query rewriting "hack"&lt;/a&gt; today - using the TOP function for limiting the result set size, which is (as far as I know) only available in MySQL.&lt;br /&gt;&lt;br /&gt;What we need to do now is expand the Query model API. 
We need to incorporate type casting. My thoughts are:&lt;br /&gt;&lt;ul&gt;&lt;li&gt;We must take an interface-first approach - how would one most appropriately like to type-cast a select item in a query? I'm thinking that we should add a "castAs(&lt;a href="http://www.eobjects.dk/hudson/job/MetaModel/ws/trunk/target/site/apidocs/dk/eobjects/metamodel/schema/ColumnType.html"&gt;ColumnType&lt;/a&gt; type)" method on &lt;a href="http://www.eobjects.dk/hudson/job/MetaModel/ws/trunk/target/site/apidocs/dk/eobjects/metamodel/query/SelectItem.html"&gt;SelectItem&lt;/a&gt;.&lt;/li&gt;&lt;li&gt;Because not all of the &lt;a href="http://www.eobjects.dk/hudson/job/MetaModel/ws/trunk/target/site/apidocs/dk/eobjects/metamodel/schema/ColumnType.html"&gt;ColumnType&lt;/a&gt;'s are supported by all databases we should consider making a more abstract type enum. Something that will only contain a couple of more basic types like String, Integer, Decimal, Date, Boolean.&lt;/li&gt;&lt;li&gt;We should use the query rewriting approach to generate the actual SQL cast syntax. Some databases use the CAST(x AS y) function, others use special-purpose functions like TO_NUMBER(x).&lt;/li&gt;&lt;/ul&gt;Another feature that I want to include in MetaModel is functions for calculating or extracting something on behalf of a column. Let's take for example the YEAR(x) function (or in some databases the EXTRACT(YEAR FROM x) function).&lt;br /&gt;&lt;ul&gt;&lt;li&gt;One would initially just think that we should add this function to the FunctionType enum and then take it from there. But actually it's quite a different type of function. While SUM, COUNT etc. 
are aggregate functions, the YEAR function is a single-value function, ie you can't call YEAR on a set of values.&lt;/li&gt;&lt;li&gt;Therefore we should consider a rename of &lt;a href="http://www.eobjects.dk/hudson/job/MetaModel/ws/trunk/target/site/apidocs/dk/eobjects/metamodel/query/FunctionType.html"&gt;FunctionType&lt;/a&gt; to AggregateFunction and create a new enum, CalculationFunction (or maybe we can come up with a better name?)&lt;/li&gt;&lt;li&gt;We can use the same approach as before (query rewriting) to handle different dialects, but we need to make sure that we pick function names that are widely accepted and understandable to the user. Personally I prefer YEAR(x) over EXTRACT(YEAR FROM x) as the syntax is clearer and there are no constants inside the parameter, which is more java-ish. The downside is that we will then also need a MONTH(x), DAY(x) etc. function but that's not a biggie I think.&lt;br /&gt;&lt;/li&gt;&lt;/ul&gt;One last note - we should also consider if it's reasonable to keep using enums. 
Maybe we should switch to interfaces (and constants in the interface to ensure no API changes) for the sake of extensibility.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-3317800987764894384?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/3317800987764894384/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=3317800987764894384' title='2 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/3317800987764894384'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/3317800987764894384'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2008/08/considering-metamodel-functions.html' title='Considering MetaModel functions'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>2</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-79304606683474551</id><published>2008-07-27T21:36:00.002+02:00</published><updated>2008-07-27T21:40:17.307+02:00</updated><title type='text'>DataCleaner 1.3 and MetaModel 1.0.3 released</title><content type='html'>Today is the day that we've released &lt;a href="http://www.eobjects.dk/datacleaner"&gt;DataCleaner&lt;/a&gt; 1.3 and &lt;a href="http://www.eobjects.dk/metamodel"&gt;MetaModel&lt;/a&gt; 1.0.3. 
Here's a summary of changes:&lt;br /&gt;&lt;br /&gt;&lt;span style="font-weight: bold;"&gt;DataCleaner 1.3&lt;/span&gt;&lt;br /&gt;This release contains many new features and improvements.&lt;br /&gt;&lt;ul&gt;&lt;li&gt;You can save and load work in the Profiler and Validator.&lt;/li&gt;&lt;li&gt;Support for working with data quality in XML files.&lt;/li&gt;&lt;li&gt;Functionality to swap/dice columns and rows in profiler result tables.&lt;/li&gt;&lt;li&gt;A more modular and plugin-friendly UI.&lt;/li&gt;&lt;li&gt;Various bugfixes, optimizations, and wider database support.&lt;/li&gt;&lt;/ul&gt;&lt;span style="font-weight: bold;"&gt;MetaModel 1.0.3&lt;/span&gt;&lt;br /&gt;A minor changes release. Only new feature is the capability to parse and map XML files to the schema-based model. Also the release marks some improvements to documentation and minor bugfixes.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-79304606683474551?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/79304606683474551/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=79304606683474551' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/79304606683474551'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/79304606683474551'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2008/07/datacleaner-13-and-metamodel-103.html' title='DataCleaner 1.3 and MetaModel 1.0.3 released'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image 
rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-837517567920589226</id><published>2008-06-27T10:13:00.006+02:00</published><updated>2008-06-28T21:41:43.503+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='notes'/><category scheme='http://www.blogger.com/atom/ns#' term='open'/><category scheme='http://www.blogger.com/atom/ns#' term='meeting'/><category scheme='http://www.blogger.com/atom/ns#' term='source'/><category scheme='http://www.blogger.com/atom/ns#' term='recommendation'/><category scheme='http://www.blogger.com/atom/ns#' term='intelligence'/><category scheme='http://www.blogger.com/atom/ns#' term='presentation'/><category scheme='http://www.blogger.com/atom/ns#' term='business'/><title type='text'>What to remember when presenting OS BI</title><content type='html'>As I wrote about in my previous blog post I went to &lt;a href="http://www.dansk-it.dk/"&gt;Danish IT&lt;/a&gt; the other day to talk about Open Source BI. I've spent a lot of time the last two days contemplating on the presentation and how the audience perceived OS BI as pretty immature and insecure... Somehow some of my points about how one could take advantage of Open Source instead of seeing it as a threat and a risk got lost in the mix. I will try and sum up some of the thoughts that I've had on what went wrong and how to present BI products for people who are unfamiliar with Open Source.&lt;br /&gt;&lt;ul&gt;&lt;li&gt;First off, the presentation wasn't really a "sales meeting", so I took on the academic perspective and showed the audience a broad view of the OS BI arena, including two alternatives for each product group; databases, ETL, reporting and OLAP. 
This was a really bad idea: Presenting the alternatives within OS was simply too much; instead of being impressed that there was a volume for rivalry between and within the OS communities this was thought of as a bad thing - fragmentation, instability etc. So instead, just show the business people a single suite of products, a silver bullet, even though we all know that this does not exist (neither does it in the commercial world, which is a good thing).&lt;/li&gt;&lt;li&gt;Taking a too feature-driven product-focus was not the best idea. Sure you should point out all the good features of OS BI products but for my demonstrations I focused on the great computational power and advanced features instead of showing them some nice user interfaces. In selecting what to present I would definitely recommend to use more user friendly products like &lt;a href="http://www.eclipse.org/birt"&gt;eclipse BIRT&lt;/a&gt;, &lt;a href="http://www.talend.com/"&gt;Talend&lt;/a&gt;, &lt;a href="http://www.openlaszlo.org/"&gt;OpenLaszlo&lt;/a&gt;, etc. Instead I showed them the Pentaho suite which for a large part has a somewhat boring theme.&lt;/li&gt;&lt;li&gt;Remember to get them a list of companies that have already undergone OS BI initiatives. I forgot this completely and it was a big mistake. I hope for the attendees that they followed my advice to go check it out themselves at &lt;a href="http://www.pentaho.com/about/customers/"&gt;Pentaho&lt;/a&gt;'s and &lt;a href="http://www.jaspersoft.com/cu_overview.html"&gt;JasperSoft&lt;/a&gt;'s websites.&lt;/li&gt;&lt;li&gt;Stress that the participation thing is not something that necessarily requires coding-skills. Show them how support forums work and that the communication part is just as important. Without useful information the developers are lost and will probably not focus on the exact same thing as the customers do.&lt;/li&gt;&lt;li&gt;Get more authority into the room. 
Hard to admit I have a hard time getting authority in a room of business people because I'm more of the academic type. So bring along colleagues and trustees to help convince the audience that your message is legitimate. Also this will help potential customers understand that you're not the only one in the business who cares about OS BI - it will tell them that there are others and certainly enough to get started with consulting, training and recruiting.&lt;/li&gt;&lt;li&gt;Show them the numbers of Open Source. &lt;a href="http://www.dwheeler.com/oss_fs_why.html"&gt;David Wheelers article&lt;/a&gt; should give you some good starting points. Address the fact that there are plenty of developers and tell them what motivates them (these points only seemed to kick in when I told them about my &lt;a href="http://www.eobjects.dk/trac"&gt;own OS projects&lt;/a&gt; so I must have not made it clear from the beginning)... Learning, reputation, ideology and "real" work/sponsored developers.&lt;br /&gt;&lt;/li&gt;&lt;/ul&gt;On the positive note however (seeing that the points above may leave you with the impression that the meeting went all wrong, which is not the case) there was several positive experiences. It helped a lot to show the actual community webpages and show how the development process worked. I did this as my last topic and it should have been in there earlier to help the audience get a feeling of the underlying ideas. Also, supplementing with notes on related Open Source products helps people understand that this is not just a "crazy idea" that popped up for BI. Show them JBoss, Apache, Linux, OpenOffice, Eclipse, Mozilla etc. And then tell them about integration, SOA etc. 
which are architectual challenges that are very suitably overcome using Open Source based solutions.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-837517567920589226?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/837517567920589226/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=837517567920589226' title='3 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/837517567920589226'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/837517567920589226'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2008/06/what-to-remember-when-presenting-os-bi.html' title='What to remember when presenting OS BI'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>3</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-775159431895933554</id><published>2008-06-19T21:55:00.005+02:00</published><updated>2008-06-25T16:52:49.566+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='open'/><category scheme='http://www.blogger.com/atom/ns#' term='it'/><category scheme='http://www.blogger.com/atom/ns#' term='source'/><category scheme='http://www.blogger.com/atom/ns#' term='keynote'/><category scheme='http://www.blogger.com/atom/ns#' term='dansk'/><category scheme='http://www.blogger.com/atom/ns#' term='intelligence'/><category 
scheme='http://www.blogger.com/atom/ns#' term='datacleaner'/><category scheme='http://www.blogger.com/atom/ns#' term='business'/><title type='text'>Addressing Open Source BI and data quality at Danish IT</title><content type='html'>I've been offered to address the Danish IT (Dansk IT) &lt;a href="http://www.dansk-it.dk/netvaerk/4_typer_netvaerk/kompetence_netvaerk/Business_Intelligence.aspx"&gt;networking group on Business Intelligence &lt;/a&gt;next wednesday. I'll be concentrating on talking about Open Source business models, pitfalls and opportunities and an overview of the Open Source BI market including demos of various tools - including &lt;a href="http://www.eobjects.dk/datacleaner"&gt;DataCleaner&lt;/a&gt; of course. I'm thinking this is a great opportunity to get people involved with OS BI - an area that has been somewhat overlooked, at least in Denmark, I think.&lt;br /&gt;&lt;br /&gt;&lt;span style="font-weight: bold;"&gt;Update: &lt;/span&gt;Just got home from the networking group and it was a very interesting day sparked with lots of discussions and perspectives on BI products. 
I can't say that everybody was convinced with going Open Source for their BI solutions but they definitely got an impression of what goes on and most people there was very interested in &lt;a href="http://www.eobjects.dk/datacleaner"&gt;DataCleaner&lt;/a&gt;, perhaps because of the few implications it has on the rest of the BI portfolio to apply our data quality solution.&lt;br /&gt;&lt;br /&gt;&lt;span style="font-weight: bold;"&gt;Update:&lt;/span&gt; You can now download &lt;a href="http://www.eobjects.dk/resources/download/DanskIt.pdf"&gt;my slides about Open Source Business Intelligence&lt;/a&gt; and please let me know what you think.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-775159431895933554?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/775159431895933554/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=775159431895933554' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/775159431895933554'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/775159431895933554'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2008/06/addressing-open-source-bi-and-data.html' title='Addressing Open Source BI and data quality at Danish IT'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' 
src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-4759082585218289077</id><published>2008-06-15T09:33:00.003+02:00</published><updated>2008-06-15T09:47:26.691+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='open'/><category scheme='http://www.blogger.com/atom/ns#' term='udvikling'/><category scheme='http://www.blogger.com/atom/ns#' term='source'/><category scheme='http://www.blogger.com/atom/ns#' term='styring'/><category scheme='http://www.blogger.com/atom/ns#' term='report'/><category scheme='http://www.blogger.com/atom/ns#' term='datacleaner'/><category scheme='http://www.blogger.com/atom/ns#' term='afløsningsopgave'/><title type='text'>Report on DataCleaner development process</title><content type='html'>As some of you may know &lt;a href="http://www.eobjects.dk/datacleaner"&gt;DataCleaner&lt;/a&gt; started as an academic project for me, investigating how Open Source projects are established, managed and developed. I've been waiting for a long time for the evaluation of the project but yesterday I finally got the results, and I'm proud to announce that I got the top-grade for the assignment! (12 in the danish grading system, which spans from -2 to 12 - I'll have to blog about the grading system some time, it's hilarious).&lt;br /&gt;&lt;br /&gt;The project received notable credits for the explorative style of development and this is something that I'm very proud to keep on practicing. I'm publishing the report for free download here, but I'm affraid it's in Danish, so if you don't understand it you'll have to ... 
learn Danish ;-)&lt;br /&gt;&lt;ul&gt;&lt;li&gt;Download '&lt;a href="http://www.eobjects.dk/resources/download/afloesningsopgave.pdf"&gt;Udvikling og styring af Open Source projekter&lt;/a&gt;' here (in Danish)&lt;/li&gt;&lt;/ul&gt;I would appreciate any kind of feedback on my research and I don't mind critics now that I have the acknowledgment of Copenhagen Business School, heh.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-4759082585218289077?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/4759082585218289077/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=4759082585218289077' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/4759082585218289077'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/4759082585218289077'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2008/06/report-on-datacleaner-development.html' title='Report on DataCleaner development process'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-3305983349897097485</id><published>2008-06-14T22:49:00.002+02:00</published><updated>2008-06-14T22:54:17.483+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='fluent'/><category scheme='http://www.blogger.com/atom/ns#' 
term='query'/><category scheme='http://www.blogger.com/atom/ns#' term='metamodel'/><category scheme='http://www.blogger.com/atom/ns#' term='metadata'/><category scheme='http://www.blogger.com/atom/ns#' term='interface'/><title type='text'>Fluent interfaces in MetaModel</title><content type='html'>Just spent a couple of hours making the Query class of &lt;a href="http://www.eobjects.dk/metamodel"&gt;MetaModel&lt;/a&gt; implement &lt;a href="http://en.wikipedia.org/wiki/Fluent_interface"&gt;fluent interfaces&lt;/a&gt;. Damn this syntax looks great:&lt;br /&gt;&lt;br /&gt;&lt;blockquote&gt;&lt;br /&gt;new &lt;a href="http://www.eobjects.dk/hudson/job/MetaModel/ws/trunk/target/site/apidocs/dk/eobjects/metamodel/query/Query.html"&gt;Query&lt;/a&gt;().select(myColumn).selectCount().from(myTable).where(myColumn, OperatorType.EQUALS_TO, "foobar").groupBy(anotherColumn).orderBy(myColumn);&lt;br /&gt;&lt;/blockquote&gt;&lt;br /&gt;&lt;br /&gt;This was one of the last TODO's with MetaModel in this round so I think we're going to release version 1.0 pretty soon! 
Stay tuned.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-3305983349897097485?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/3305983349897097485/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=3305983349897097485' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/3305983349897097485'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/3305983349897097485'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2008/06/fluent-interfaces-in-metamodel.html' title='Fluent interfaces in MetaModel'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-2154860217112567638</id><published>2008-06-11T23:02:00.005+02:00</published><updated>2008-12-09T00:56:41.575+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='look'/><category scheme='http://www.blogger.com/atom/ns#' term='great'/><category scheme='http://www.blogger.com/atom/ns#' term='user interface'/><category scheme='http://www.blogger.com/atom/ns#' term='datacleaner'/><category scheme='http://www.blogger.com/atom/ns#' term='screenshot'/><title type='text'>DataCleaner looks...</title><content type='html'>Been working a couple of days on a new great look for &lt;a 
href="http://www.eobjects.dk/datacleaner"&gt;DataCleaner&lt;/a&gt;, check it out:&lt;br /&gt;&lt;br /&gt;&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://2.bp.blogspot.com/_UpvxZrigQfQ/SFA9hkbtBHI/AAAAAAAAAAo/Rr4vyys3GiA/s1600-h/dc_1.2_snapshot_a.png"&gt;&lt;img style="margin: 0px auto 10px; display: block; text-align: center; cursor: pointer;" src="http://2.bp.blogspot.com/_UpvxZrigQfQ/SFA9hkbtBHI/AAAAAAAAAAo/Rr4vyys3GiA/s320/dc_1.2_snapshot_a.png" alt="" id="BLOGGER_PHOTO_ID_5210732415792514162" border="0" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;I'm starting to get real excited about releasing version 1.2, it'll be a radical improvement in terms of both visual experience and functionality I think!&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-2154860217112567638?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/2154860217112567638/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=2154860217112567638' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/2154860217112567638'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/2154860217112567638'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2008/06/datacleaner-looks.html' title='DataCleaner looks...'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><media:thumbnail 
xmlns:media='http://search.yahoo.com/mrss/' url='http://2.bp.blogspot.com/_UpvxZrigQfQ/SFA9hkbtBHI/AAAAAAAAAAo/Rr4vyys3GiA/s72-c/dc_1.2_snapshot_a.png' height='72' width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-5698842720988352939</id><published>2008-06-01T23:10:00.001+02:00</published><updated>2008-06-01T23:11:33.953+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='java'/><category scheme='http://www.blogger.com/atom/ns#' term='query'/><category scheme='http://www.blogger.com/atom/ns#' term='rows'/><category scheme='http://www.blogger.com/atom/ns#' term='millions'/><category scheme='http://www.blogger.com/atom/ns#' term='large'/><category scheme='http://www.blogger.com/atom/ns#' term='metamodel'/><category scheme='http://www.blogger.com/atom/ns#' term='huge'/><category scheme='http://www.blogger.com/atom/ns#' term='jdbc'/><category scheme='http://www.blogger.com/atom/ns#' term='resultset'/><title type='text'>How to process millions of resultset rows in java</title><content type='html'>I'm so excited, since I just think that we've solved a very common problem in java applications that have to deal with huge amounts of data. Here's the trouble:&lt;br /&gt;&lt;ol&gt;&lt;li&gt;Even though the JDBC spec. defines a way to specify the fetch size when executing queries, some drivers do not implement this feature, which means your program will run out of memory if you query eg. 
a couple of millions of records.&lt;/li&gt;&lt;li&gt;Even if your driver works as it is supposed to (that would be a reasonable assumption in most cases) there's still no effective way to optimize the computation of the many records by multithreading since the data is streamed through a single connection.&lt;br /&gt;&lt;/li&gt;&lt;/ol&gt;Because of the power of the &lt;a href="http://www.eobjects.dk/metamodel"&gt;MetaModel&lt;/a&gt; schema and query model we've been able to create a generic mechanism for splitting up a query into other queries that will yield less rows but the same collective result. The way we do this is by identifying attributes that can be used to filter in WHERE clauses, for example:&lt;br /&gt;&lt;ul&gt;&lt;li&gt;Consider we want to split up the query: "SELECT name, email FROM persons"&lt;/li&gt;&lt;li&gt;We will investigate the &lt;span style="font-style: italic;"&gt;persons&lt;/span&gt; table and find columns that can be used to split the total resultset. We might find a reasonable &lt;span style="font-style: italic;"&gt;age&lt;/span&gt;-column for this, so the query could be split to:&lt;/li&gt;&lt;/ul&gt;&lt;ol&gt;&lt;li&gt;SELECT name, email FROM persons WHERE age &lt; 30 OR age IS NULL&lt;br /&gt;&lt;/li&gt;&lt;li&gt;SELECT name, email FROM persons WHERE age &gt; 30 OR age = 30&lt;/li&gt;&lt;/ol&gt;Depending on the desired size of the partial queries we will split up by finding further columns or by defining finer intervals to split by. 
Here's how it works:&lt;br /&gt;&lt;blockquote&gt;&lt;a href="http://www.eobjects.dk/hudson/job/MetaModel/ws/trunk/target/site/apidocs/dk/eobjects/metamodel/DataContext.html"&gt;DataContext&lt;/a&gt; dc = ...&lt;br /&gt;&lt;a href="http://www.eobjects.dk/hudson/job/MetaModel/ws/trunk/target/site/apidocs/dk/eobjects/metamodel/query/Query.html"&gt;Query&lt;/a&gt; q = ...&lt;br /&gt;&lt;br /&gt;&lt;a href="http://www.eobjects.dk/hudson/job/MetaModel/ws/trunk/target/site/apidocs/dk/eobjects/metamodel/QuerySplitter.html"&gt;QuerySplitter&lt;/a&gt; qs = new &lt;a href="http://www.eobjects.dk/hudson/job/MetaModel/ws/trunk/target/site/apidocs/dk/eobjects/metamodel/QuerySplitter.html"&gt;QuerySplitter&lt;/a&gt;(dc, q);&lt;br /&gt;List&lt;&lt;a href="http://www.eobjects.dk/hudson/job/MetaModel/ws/trunk/target/site/apidocs/dk/eobjects/metamodel/query/Query.html"&gt;Query&lt;/a&gt;&gt; queries = qs.splitQueries();&lt;br /&gt;&lt;/blockquote&gt;I'd love to know what you all think of this? Personally I think it's a lovely way to optimize memory consumption and it offers new ways to utilize grid computing by distributing partial queries to different nodes in the grid to do remote processing. 
Also a lot of databases (MySQL for example) only dedicates a single thread per query - so by splitting the queries one could further optimize multithreading on the database.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-5698842720988352939?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/5698842720988352939/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=5698842720988352939' title='1 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/5698842720988352939'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/5698842720988352939'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2008/05/how-to-process-millions-of-resultset.html' title='How to process millions of resultset rows in java'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>1</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-1832397743366733526</id><published>2008-05-15T23:03:00.003+02:00</published><updated>2008-05-15T23:07:57.260+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='choise'/><category scheme='http://www.blogger.com/atom/ns#' term='awards'/><category scheme='http://www.blogger.com/atom/ns#' term='community'/><category scheme='http://www.blogger.com/atom/ns#' term='sourceforge'/><category scheme='http://www.blogger.com/atom/ns#' 
term='datacleaner'/><title type='text'>Community choise awards</title><content type='html'>&lt;a href="http://sourceforge.net/awards/cca/?group_id=217469"&gt;&lt;img src="http://sourceforge.net/awards/cca/badge_img.php?group_id=217469&amp;style=3" border="0" style="float: right;" /&gt;&lt;/a&gt;&lt;br /&gt;I just signed &lt;a href="http://www.eobjects.dk/datacleaner"&gt;DataCleaner&lt;/a&gt; up for sourceforge's &lt;a href="http://sourceforge.net/community/cca08"&gt;community choise awards 08&lt;/a&gt; :) I definately don't count on us winning anything though... Not because we don't deserve it (we do) but because noone knows about us (yet). Anyhoot, you can all help change that - go nominate us for "best new project" and "best project for the enterprise"!&lt;br /&gt;&lt;br /&gt;Click the banner to the right to give us your vote!&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-1832397743366733526?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/1832397743366733526/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=1832397743366733526' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/1832397743366733526'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/1832397743366733526'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2008/05/community-choise-awards.html' title='Community choise awards'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' 
src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-3187200610330110677</id><published>2008-05-14T21:10:00.008+02:00</published><updated>2008-12-09T00:56:41.688+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='result'/><category scheme='http://www.blogger.com/atom/ns#' term='explore'/><category scheme='http://www.blogger.com/atom/ns#' term='drill'/><category scheme='http://www.blogger.com/atom/ns#' term='data'/><category scheme='http://www.blogger.com/atom/ns#' term='detail'/><category scheme='http://www.blogger.com/atom/ns#' term='profile'/><category scheme='http://www.blogger.com/atom/ns#' term='explode'/><category scheme='http://www.blogger.com/atom/ns#' term='to'/><category scheme='http://www.blogger.com/atom/ns#' term='datacleaner'/><title type='text'>Drill-to-detail in profiling results</title><content type='html'>I've just added a &lt;a href="http://www.eobjects.dk/trac/ticket/72"&gt;new feature&lt;/a&gt; to &lt;a href="http://www.eobjects.dk/datacleaner"&gt;DataCleaner&lt;/a&gt; that I'm very happy with. Now when you see a profiling result you can interact with it and drill to details on measures of interest to gain insight into how those measures are composed. 
Consider the screenshot below (sorry for the ugly window borders, I'm running linux here and we're still trying to &lt;a href="http://www.eobjects.dk/trac/ticket/100"&gt;tweak the UI&lt;/a&gt; to look sweet with the &lt;a href="http://www.jgoodies.com/"&gt;jgoodies&lt;/a&gt; look'n'feel).&lt;br /&gt;&lt;br /&gt;&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://4.bp.blogspot.com/_UpvxZrigQfQ/SCs5Y4I7u5I/AAAAAAAAAAM/4Ok82nRRs8E/s1600-h/Drill-to-detail.png"&gt;&lt;img style="margin: 0px auto 10px; display: block; text-align: center; cursor: pointer;" src="http://4.bp.blogspot.com/_UpvxZrigQfQ/SCs5Y4I7u5I/AAAAAAAAAAM/4Ok82nRRs8E/s400/Drill-to-detail.png" alt="" id="BLOGGER_PHOTO_ID_5200313294278867858" border="0" /&gt;&lt;/a&gt;What you're seeing is that the &lt;a href="http://www.eobjects.dk/trac/wiki/DataCleanerPatternFinder"&gt;pattern finder&lt;/a&gt; identified a couple of patterns for the ADDRESSLINE2 column and I've just clicked the "aaaaa 999" pattern to find out which value where categorized to fit this pattern. This queries (drills to details) the datastore for these columns. Similarly I could have wanted to find out which values yielded the "max chars", "min chars" or other measures in the profile result matrix above. If I do that it will similarly query the values that originated the measures.&lt;br /&gt;&lt;br /&gt;So how is this possible? Thanks to &lt;a href="http://www.eobjects.dk/metamodel"&gt;MetaModel&lt;/a&gt; it's quite easy to model such data explorations! Even though some of these queries cannot be transformed into SQL it is possible to post-process them using row filters that filter out only the correct rows based on an initial query. 
Let's look at an example, how to drill to details on the "max words" measure in &lt;a href="http://www.eobjects.dk/trac/wiki/DataCleanerStringAnalysis"&gt;String analysis&lt;/a&gt;:&lt;br /&gt;&lt;br /&gt;&lt;blockquote&gt;&lt;br /&gt;&lt;span style="color: green;"&gt;//For each column result...&lt;/span&gt;&lt;br /&gt;MatrixValue mv = matrixValues[7]; &lt;span style="color: green;"&gt;// 7 is the index of the "max words" measure&lt;/span&gt;&lt;br /&gt;&lt;br /&gt;&lt;span style="color: green;"&gt;//Construct a query that get's as close as possible to the desired result&lt;/span&gt;&lt;br /&gt;Query q = new Query();&lt;br /&gt;q.addSelect(column);&lt;br /&gt;q.addFrom(column.getTable())&lt;br /&gt;q.addGroupBy(column).addSelect(new SelectItem(FunctionType.COUNT,"*",null));&lt;br /&gt;&lt;br /&gt;&lt;span style="color: green;"&gt;//Set the query as the source of the details&lt;/span&gt;&lt;br /&gt;mv.setDetailSource(q);&lt;br /&gt;&lt;br /&gt;&lt;span style="color: green;"&gt;//Set a post processing row filter to include only those&lt;br /&gt;//rows that have the right number of words&lt;br /&gt;//(something that cannot be expressed in a query)&lt;/span&gt;&lt;br /&gt;mv.addDetailRowFilter(new IRowFilter() {&lt;br /&gt; &amp;nbsp; public boolean accept(Row row) {&lt;br /&gt; &amp;nbsp; &amp;nbsp; &amp;nbsp; Object value = row.getValue(column);&lt;br /&gt; &amp;nbsp; &amp;nbsp; &amp;nbsp; if (value != null &amp;&amp; &lt;br /&gt; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; new StringTokenizer(value.toString()).countTokens() == numWords) {&lt;br /&gt; &amp;nbsp; &amp;nbsp; &amp;nbsp;  &amp;nbsp; &amp;nbsp; &amp;nbsp; return true;&lt;br /&gt; &amp;nbsp; &amp;nbsp; &amp;nbsp; }&lt;br /&gt; &amp;nbsp; &amp;nbsp; &amp;nbsp; return false;&lt;br /&gt; &amp;nbsp; }&lt;br /&gt;});&lt;br /&gt;&lt;/blockquote&gt;&lt;br /&gt;&lt;br /&gt;&lt;p&gt;The code above is a slight rewrite of the &lt;a 
href="http://www.eobjects.dk/trac/browser/datacleaner/DataCleaner-core/trunk/src/main/java/dk/eobjects/datacleaner/profiler/trivial/StringAnalysisProfile.java"&gt;real code for String analysis&lt;/a&gt; and it shows how easy it is (and ought to be) to attach a query to a value in a profile result! Drill-to-detail features are now a common thing in DataCleaner and in my oppinion it leverages the use of the profiler functionality to a new level.&lt;/p&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-3187200610330110677?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/3187200610330110677/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=3187200610330110677' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/3187200610330110677'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/3187200610330110677'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2008/05/drill-to-detail-in-profiling-results.html' title='Drill-to-detail in profiling results'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://4.bp.blogspot.com/_UpvxZrigQfQ/SCs5Y4I7u5I/AAAAAAAAAAM/4Ok82nRRs8E/s72-c/Drill-to-detail.png' height='72' 
width='72'/><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-4309358428946402859</id><published>2008-05-07T23:49:00.003+02:00</published><updated>2008-05-07T23:52:50.365+02:00</updated><title type='text'>That was easy</title><content type='html'>This week's work...&lt;br /&gt;&lt;a href="http://www.eobjects.dk/metamodel"&gt;MetaModel&lt;/a&gt; &lt;a href="http://www.eobjects.dk/trac/ticket/126"&gt;is now a part of&lt;/a&gt; &lt;a href="http://www.eobjects.dk/datacleaner"&gt;the DataCleaner stack&lt;/a&gt;!&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-4309358428946402859?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/4309358428946402859/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=4309358428946402859' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/4309358428946402859'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/4309358428946402859'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2008/05/that-was-easy.html' title='That was easy'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' 
src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-1748983370230858287</id><published>2008-05-03T12:25:00.012+02:00</published><updated>2008-11-14T17:27:45.292+01:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='csv'/><category scheme='http://www.blogger.com/atom/ns#' term='mutable'/><category scheme='http://www.blogger.com/atom/ns#' term='schema'/><category scheme='http://www.blogger.com/atom/ns#' term='query'/><category scheme='http://www.blogger.com/atom/ns#' term='metamodel'/><category scheme='http://www.blogger.com/atom/ns#' term='excel'/><category scheme='http://www.blogger.com/atom/ns#' term='column'/><category scheme='http://www.blogger.com/atom/ns#' term='eobjects'/><category scheme='http://www.blogger.com/atom/ns#' term='table'/><category scheme='http://www.blogger.com/atom/ns#' term='database'/><category scheme='http://www.blogger.com/atom/ns#' term='jdbc'/><title type='text'>Demonstrating the mutable query</title><content type='html'>Having wanted for a long time to be able to manipulate queries in an object-oriented manner is not something that I think has been just my personal ambition, but a strong barrier for a lot of data-centric applications out there. Considering the many dialects of SQL it is also striking how few attempts there has been to unify these dialects using a common meta-layer. Sure there are very succesfull (and good!) ORM frameworks like &lt;a href="http://www.hibernate.org/"&gt;Hibernate&lt;/a&gt; and &lt;a href="http://java.sun.com/jdo/"&gt;JDO&lt;/a&gt; that have built in dialect-translation but when you think about it they actually apply to a very limited scenario where you &lt;span style="font-style: italic;"&gt;know in advance &lt;/span&gt;how the database is structured and what objects you want to store the content of the database. 
Consider instead a model of a database that is not a &lt;span style="font-style: italic;"&gt;mapping to domain objects&lt;/span&gt;, but more an object-oriented wrapper for the database since the database &lt;span style="font-style: italic;"&gt;is &lt;/span&gt;the domain. The advantages of an object-oriented domain model for the database are obvious, here are a couple of pointers;&lt;br /&gt;&lt;ul&gt;&lt;li&gt;Traversing a database (schemas, tables, columns, relationships etc.) becomes as easy as it is to traverse any other domain model.&lt;/li&gt;&lt;li&gt;Building and manipulating queries can be a continuous and type-safe process. This gives us the opportunity to do meaningful fault-tolerant query-modifications in order to recover from database-insufficiencies and to interpret the query not only as SQL, but as a generic query built for non-SQL datastores.&lt;br /&gt;&lt;/li&gt;&lt;li&gt;It is possible now to mimic a database with other data technologies, for example flat/csv files, excel spreadsheets, web services or hardcoded datastores.&lt;/li&gt;&lt;li&gt;Building systems to support the database structure (for example metadata applications) becomes possible without extensive JDBC hacking.&lt;/li&gt;&lt;/ul&gt;Anyhoot, look no further, because we have invested considerable time into these issues by developing &lt;a href="http://eobjects.org/metamodel"&gt;eobjects MetaModel&lt;/a&gt;. Let me show you how you build queries using the MetaModel framework in a totally type-safe manner!&lt;br /&gt;&lt;blockquote&gt;&lt;br /&gt;java.sql.Connection connection = ... 
//Obtain a JDBC connection&lt;br /&gt;&lt;a href="http://eobjects.org/metamodel/apidocs/current/dk/eobjects/metamodel/DataContext.html"&gt;DataContext&lt;/a&gt; dataContext = &lt;a href="http://eobjects.org/metamodel/apidocs/current/dk/eobjects/metamodel/DataContextFactory.html"&gt;DataContextFactory&lt;/a&gt;.createJdbcDataContext(connection);&lt;br /&gt;&lt;a href="http://eobjects.org/metamodel/apidocs/current/dk/eobjects/metamodel/schema/Schema.html"&gt;Schema&lt;/a&gt;[] schemas = dataContext.getSchemas();&lt;br /&gt;&lt;a href="http://eobjects.org/metamodel/apidocs/current/dk/eobjects/metamodel/schema/Table.html"&gt;Table&lt;/a&gt; customerTable = schemas[0].getTableByName("customers");&lt;br /&gt;&lt;br /&gt;&lt;a href="http://eobjects.org/metamodel/apidocs/current/dk/eobjects/metamodel/query/Query.html"&gt;Query&lt;/a&gt; q = new &lt;a href="http://eobjects.org/metamodel/apidocs/current/dk/eobjects/metamodel/query/Query.html"&gt;Query&lt;/a&gt;();&lt;br /&gt;q.addFrom(customerTable);&lt;br /&gt;q.addSelect(customerTable.getColumns());&lt;br /&gt;&lt;a href="http://eobjects.org/metamodel/apidocs/current/dk/eobjects/metamodel/data/DataSet.html"&gt;DataSet&lt;/a&gt; dataSet = dataContext.executeQuery(q);&lt;br /&gt;while (dataSet.next()) {&lt;br /&gt;&amp;nbsp;  &lt;a href="http://eobjects.org/metamodel/apidocs/current/dk/eobjects/metamodel/data/Row.html"&gt;Row&lt;/a&gt;   row = dataSet.getRow();&lt;br /&gt;&amp;nbsp;  System.out.println(row.toString());&lt;br /&gt;}&lt;br /&gt;&lt;/blockquote&gt;&lt;br /&gt;Of course in this example a couple of hardcoded values are entered such as the "customers" literal, but I could have just as well traversed the schema by going through every table without any hard references.&lt;br /&gt;&lt;br /&gt;So I guess the most striking question is: Doesn't it cause some ineffeciencies to generalize queries like this? And can I still execute my custom SQL-string query? 
The answer is "no, because that goes against the whole idea of MetaModel". You shouldn't be able to do anything that is not portable to other datastores. Then you could just as well do it "the old" JDBC way and of course we don't prohibit you from doing so. But let's instead &lt;a href="http://eobjects.org/metamodel/apidocs/current"&gt;take a look&lt;/a&gt; at how to do regular (portable and standards-compliant) SQL queries in MetaModel:&lt;br /&gt;&lt;blockquote&gt;&lt;br /&gt;&lt;a href="http://eobjects.org/metamodel/apidocs/current/dk/eobjects/metamodel/query/Query.html"&gt;Query&lt;/a&gt; q = new &lt;a href="http://eobjects.org/metamodel/apidocs/current/dk/eobjects/metamodel/query/Query.html"&gt;Query&lt;/a&gt;();&lt;br /&gt;q.addFrom(new &lt;a href="http://eobjects.org/metamodel/apidocs/current/dk/eobjects/metamodel/query/FromItem.html"&gt;FromItem&lt;/a&gt;(schema.getTable("products")).setAlias("p"));&lt;br /&gt;q.addSelect(schema.getTable("products").getColumn("product_type"));&lt;br /&gt;q.addGroupBy(schema.getTable("products").getColumn("product_type"));&lt;br /&gt;&lt;br /&gt;System.out.println(q.toString());&lt;br /&gt;//yields: "SELECT p.product_type FROM products p GROUP BY p.product_type"&lt;br /&gt;&lt;/blockquote&gt;&lt;br /&gt;Notice here that there are multiple overloaded addFrom() methods in Query. In the first example we added a table as the argument, in this next example we used a FromItem and gave it an alias. 
The same principle applies to the other parts of the query, let's see it by adding an aggregate function to our query:&lt;br /&gt;&lt;blockquote&gt;&lt;br /&gt;&lt;a href="http://eobjects.org/metamodel/apidocs/current/dk/eobjects/metamodel/schema/Column.html"&gt;Column&lt;/a&gt; priceColumn = schema.getTable("products").getColumn("price");&lt;br /&gt;q.addSelect(new &lt;a href="http://eobjects.org/metamodel/apidocs/current/dk/eobjects/metamodel/query/SelectItem.html"&gt;SelectItem&lt;/a&gt;(&lt;a href="http://eobjects.org/metamodel/apidocs/current/dk/eobjects/metamodel/query/FunctionType.html"&gt;FunctionType&lt;/a&gt;.SUM, priceColumn));&lt;br /&gt;System.out.println(q.toString());&lt;br /&gt;//yields: "SELECT p.product_type, SUM(p.price) FROM products p GROUP BY p.product_type"&lt;br /&gt;&lt;/blockquote&gt;&lt;br /&gt;Okay, it seems pretty straightforward eh? That's because it &lt;i&gt;is&lt;/i&gt;. So get started and use MetaModel. Here's some more examples for ya so you can see how easy it is to build even complex queries:&lt;br /&gt;&lt;blockquote&gt;&lt;br /&gt;&lt;a href="http://eobjects.org/metamodel/apidocs/current/dk/eobjects/metamodel/schema/Column.html"&gt;Column&lt;/a&gt; marketYearColumn = schema.getTable("products").getColumn("market_year");&lt;br /&gt;SelectItem yearSelect = new SelectItem(marketYearColumn);&lt;br /&gt;&lt;br /&gt;q.addWhere(new &lt;a href="http://eobjects.org/metamodel/apidocs/current/dk/eobjects/metamodel/query/FilterItem.html"&gt;FilterItem&lt;/a&gt;(yearSelect , &lt;a href="http://eobjects.org/metamodel/apidocs/current/dk/eobjects/metamodel/query/OperatorType.html"&gt;OperatorType&lt;/a&gt;.HIGHER_THAN, 2004));&lt;br /&gt;System.out.println(q.toString());&lt;br /&gt;//yields: "SELECT p.product_type, SUM(p.price) FROM products p WHERE p.market_year &gt; 2004 GROUP BY p.product_type"&lt;br /&gt;&lt;br /&gt;q.addOrderBy(new &lt;a 
href="http://eobjects.org/metamodel/apidocs/current/dk/eobjects/metamodel/query/OrderByItem.html"&gt;OrderByItem&lt;/a&gt;(q.getSelectClause().getItem(0),false));&lt;br /&gt;System.out.println(q.toString());&lt;br /&gt;//yields: "SELECT p.product_type, SUM(p.price) FROM products p WHERE p.market_year &gt; 2004 GROUP BY p.product_type ORDER BY p.product_type"&lt;br /&gt;&lt;/blockquote&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-1748983370230858287?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/1748983370230858287/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=1748983370230858287' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/1748983370230858287'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/1748983370230858287'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2008/05/demonstrating-mutable-query.html' title='Demonstrating the mutable query'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-1694586422958696516</id><published>2008-04-23T16:16:00.004+02:00</published><updated>2008-04-23T16:36:00.834+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='domain'/><category scheme='http://www.blogger.com/atom/ns#' 
term='data'/><category scheme='http://www.blogger.com/atom/ns#' term='java'/><category scheme='http://www.blogger.com/atom/ns#' term='open'/><category scheme='http://www.blogger.com/atom/ns#' term='source'/><category scheme='http://www.blogger.com/atom/ns#' term='model'/><category scheme='http://www.blogger.com/atom/ns#' term='meta'/><title type='text'>A meta domain model</title><content type='html'>In the field of object-orientation it has for some years been common to talk about modelling the domain; to create domain models. This makes perfect sense if you're a car dealer or a pet shop, ie. a business with a specific domain. But what about those development projects that does not apply to a specific domain, but to all kinds of domains?&lt;br /&gt;&lt;br /&gt;This was the situation when I decided to start the &lt;a href="http://www.eobjects.dk/datacleaner"&gt;DataCleaner project&lt;/a&gt; some months ago. What I needed here was a domain model concentrated with data sources, their structures and the data they contained. I searched the Open Source offering on this and to my surprise didn't find any solid attempts to do this. Sure there are object-relational-frameworks like &lt;a href="http://www.hibernate.org/"&gt;Hibernate&lt;/a&gt; which enables you to create your own domain model objects and "easily" map them to a database, but it's not possible to map the database itself to objects that represent the structure of the database.&lt;br /&gt;&lt;br /&gt;Initially I just started making such a model myself - I mean, how hard could it be? The result was the DataCleaner metadata &amp;amp; data layer, which works fine in DataCleaner, but wasn't quite developed for reuse in other applications. So now I've &lt;a href="https://groups.google.com/group/datacleaner/browse_thread/thread/d9f6dd98551e64fe"&gt;started creating a new project&lt;/a&gt;, which I'm calling &lt;a href="http://www.eobjects.dk/"&gt;eobjects.dk&lt;/a&gt; MetaModel. 
The MetaModel project takes off where the DataCleaner metadata &amp;amp; data layer stops. We have classes like Schema, Table, Column etc. but we will try to remove (encapsulate) any tie to JDBC, because if you want to see a messy API (or more correctly: messy implementations), then look at JDBC. We will also use the MetaModel to take advantage of "new" language constructions in java like enumerations, generics etc.&lt;br /&gt;&lt;br /&gt;Here are some of the plans for MetaModel:&lt;br /&gt;&lt;ul&gt;&lt;li&gt;Schema model: Schema (and unification of the very ambiguous Catalog and Schema terminology in JDBC), Table, Column, ColumnType (enum), TableType (enum) etc.&lt;br /&gt;&lt;/li&gt;&lt;li&gt;Query model: Query, SelectClause, FromClause, WhereClause, GroupByClause, HavingClause, OrderByClause etc.&lt;br /&gt;&lt;/li&gt;&lt;li&gt;Data model: Dataset (with Streaming and keep-in-memory options), Row&lt;/li&gt;&lt;/ul&gt;My hope for this is to make an API which makes it possible to interact with your database in a type-safe manner and avoid query problems, hardcoded literals in the code etc.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-1694586422958696516?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/1694586422958696516/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=1694586422958696516' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/1694586422958696516'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/1694586422958696516'/><link rel='alternate' type='text/html' 
href='http://kasper.eobjects.org/2008/04/meta-domain-model.html' title='A meta domain model'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-56783080122350285</id><published>2008-04-19T14:57:00.012+02:00</published><updated>2008-04-19T15:41:52.699+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='data'/><category scheme='http://www.blogger.com/atom/ns#' term='java'/><category scheme='http://www.blogger.com/atom/ns#' term='query'/><category scheme='http://www.blogger.com/atom/ns#' term='excel'/><category scheme='http://www.blogger.com/atom/ns#' term='open'/><category scheme='http://www.blogger.com/atom/ns#' term='eobjects'/><category scheme='http://www.blogger.com/atom/ns#' term='metadata'/><category scheme='http://www.blogger.com/atom/ns#' term='source'/><category scheme='http://www.blogger.com/atom/ns#' term='datacleaner'/><category scheme='http://www.blogger.com/atom/ns#' term='jdbc'/><category scheme='http://www.blogger.com/atom/ns#' term='csv'/><category scheme='http://www.blogger.com/atom/ns#' term='cleaner'/><category scheme='http://www.blogger.com/atom/ns#' term='database'/><title type='text'>Querying data with DataCleaner-core</title><content type='html'>I promised the other day that I would return on the topic of using the &lt;a href="http://www.eobjects.dk/trac/wiki/DataCleanerDesign"&gt;metadata &amp;amp; data layer of DataCleaner-core&lt;/a&gt;. So here's what we'll do;&lt;br /&gt;&lt;br /&gt;1) Open up a connection to the database (this is plain old JDBC). 
Here's how to do it with Derby, but any database could do:&lt;br /&gt;&lt;blockquote&gt;&lt;br /&gt;Class.forName("org.apache.derby.jdbc.EmbeddedDriver");&lt;br /&gt;Connection con = DriverManager.getConnection("jdbc:derby:my_database;");&lt;br /&gt;&lt;/blockquote&gt;&lt;br /&gt;&lt;br /&gt;2) Let's create a schema object for an easier, object-oriented way of accessing the data.&lt;br /&gt;&lt;blockquote&gt;&lt;br /&gt;JdbcSchemaFactory schemaFactory = new JdbcSchemaFactory();&lt;br /&gt;ISchema[] schemas = schemaFactory.getSchemas(con);&lt;br /&gt;&lt;/blockquote&gt;&lt;br /&gt;Note that by default the JDBC schema factory only retrieves relations from the database of type "TABLE". You could in some situations though wish to broaden this restriction, for example to enable views:&lt;br /&gt;&lt;blockquote&gt;&lt;br /&gt;JdbcSchemaFactory schemaFactory = new JdbcSchemaFactory();&lt;br /&gt;schemaFactory.setTableTypes(new String[] {"TABLE","VIEW"});&lt;br /&gt;ISchema[] schemas = schemaFactory.getSchemas(con);&lt;br /&gt;&lt;/blockquote&gt;&lt;br /&gt;&lt;br /&gt;3) Let's try exploring our metadata, consisting of &lt;a href="http://www.eobjects.dk/hudson/job/DataCleaner-core/ws/trunk/target/site/apidocs/dk/eobjects/datacleaner/metadata/ISchema.html"&gt;schemas&lt;/a&gt;, &lt;a href="http://www.eobjects.dk/hudson/job/DataCleaner-core/ws/trunk/target/site/apidocs/dk/eobjects/datacleaner/metadata/ITable.html"&gt;tables&lt;/a&gt; and &lt;a href="http://www.eobjects.dk/hudson/job/DataCleaner-core/ws/trunk/target/site/apidocs/dk/eobjects/datacleaner/metadata/IColumn.html"&gt;columns&lt;/a&gt;.&lt;br /&gt;&lt;blockquote&gt;&lt;br /&gt;ITable productTable = schemas[0].getTableByName("Products");&lt;br /&gt;IColumn[] productColumns = productTable.getColumns();&lt;br /&gt;IColumn productCodeColumn = productTable.getColumnByName("product_code");&lt;br /&gt;&lt;br /&gt;//This next int represents one of the constants in java.sql.Types.&lt;br /&gt;int productCodeType = 
productCodeColumn.getColumnType();&lt;br /&gt;boolean isProductCodeLiteral = MetadataHelper.isLiteral(productCodeType);&lt;br /&gt;boolean isProductCodeNumber = MetadataHelper.isNumber(productCodeType);&lt;br /&gt;&lt;/blockquote&gt;&lt;br /&gt;&lt;br /&gt;4) Time to make a query or two. Let's start off by just querying the whole table and then querying two specific columns.&lt;br /&gt;&lt;br /&gt;&lt;blockquote&gt;&lt;br /&gt;JdbcDataFactory dataFactory = new JdbcDataFactory();&lt;br /&gt;IData someDataThatWeWillDiscard = dataFactory.getData(con, productTable);&lt;br /&gt;IData data = dataFactory.getData(con, productTable, productCodeColumn, anotherColumn);&lt;br /&gt;while (data.next()) {&lt;br /&gt;&lt;blockquote style="margin-top: 2px; margin-bottom: 2px;"&gt;&lt;br /&gt;IRow row = data.getRow();&lt;br /&gt;int count = data.getCount();&lt;br /&gt;System.out.println("Observed " + count + " rows with product code: " + row.getValue(productCodeColumn));&lt;br /&gt;&lt;/blockquote&gt;&lt;br /&gt;}&lt;br /&gt;&lt;/blockquote&gt;&lt;br /&gt;&lt;br /&gt;Notice the &lt;a href="http://www.eobjects.dk/hudson/job/DataCleaner-core/ws/trunk/target/site/apidocs/dk/eobjects/datacleaner/metadata/IData.html"&gt;IData&lt;/a&gt;.getCount() which is crucial to understand. The data factory will try to generate a group by query to optimize the load on traffic between database server and client. Sometimes this is not possible though (for example for TEXT types in Derby, where GROUP BY is not allowed). The getCount() method returns how many occurances there are of this distinct combination of values, represented by the &lt;a href="http://www.eobjects.dk/hudson/job/DataCleaner-core/ws/trunk/target/site/apidocs/dk/eobjects/datacleaner/metadata/IRow.html"&gt;IRow&lt;/a&gt; interface. 
So make sure to always check the count, maybe there are less rows in the result than in the actual database, because the results have been compressed!&lt;br /&gt;&lt;br /&gt;Observe in general how strongly typed an API this is. In other data-oriented API's one would have to type in the same column name several places (at least in the query and when iterating through the results) but with the DataCleaner-core metadata and data layer we get a completely object oriented and type safe way to do this. The amazing thing about this API is also that we could have just as well done the same thing with &lt;a href="http://kaspersorensen.blogspot.com/2008/04/traversing-schemas-with-datacleaner.html"&gt;flat files or other data source types&lt;/a&gt;.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-56783080122350285?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/56783080122350285/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=56783080122350285' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/56783080122350285'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/56783080122350285'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2008/04/querying-data-with-datacleaner-core.html' title='Querying data with DataCleaner-core'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' 
src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-7265256971798682063</id><published>2008-04-18T12:13:00.009+02:00</published><updated>2008-08-05T10:57:29.495+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='data'/><category scheme='http://www.blogger.com/atom/ns#' term='java'/><category scheme='http://www.blogger.com/atom/ns#' term='screen'/><category scheme='http://www.blogger.com/atom/ns#' term='headless'/><category scheme='http://www.blogger.com/atom/ns#' term='linux'/><category scheme='http://www.blogger.com/atom/ns#' term='swing'/><category scheme='http://www.blogger.com/atom/ns#' term='cleaner'/><category scheme='http://www.blogger.com/atom/ns#' term='display'/><category scheme='http://www.blogger.com/atom/ns#' term='test'/><category scheme='http://www.blogger.com/atom/ns#' term='hudson'/><category scheme='http://www.blogger.com/atom/ns#' term='uispec4j'/><category scheme='http://www.blogger.com/atom/ns#' term='datacleaner'/><title type='text'>Getting uispec4j to run in Hudson on a headless linuxbox</title><content type='html'>I've been fiddling around with this problem for some time now and I finally got all the pieces together so I guess I'd better share my newfound knowledge on these obscure topics that I hope I'll never have to encounter again.&lt;br /&gt;&lt;br /&gt;It all started with a new fine and dandy testframework called &lt;a href="http://www.uispec4j.org/"&gt;uispec4j&lt;/a&gt;, that we wanted to use for &lt;a href="http://www.eobjects.dk/datacleaner"&gt;DataCleaner&lt;/a&gt;s GUI. 
Uispec4j is supposedly "Java GUI testing made simple" and so they caught our attention because the code coverage of DataCleaner GUI was not &lt;a href="http://www.eobjects.dk/hudson/job/DataCleaner-gui/ws/trunk/target/site/cobertura/index.html"&gt;that impressive&lt;/a&gt; (yet, if you read this blog post and a lot of time has passed, it may hopefully be looking better).&lt;br /&gt;&lt;br /&gt;So we started off by creating some neat unittests for DataCleaner GUI, using uispec4j. Hurray. They worked fine and dandy on our Windows development machines so we uploaded them to the repository and into the Continuous Integration loop. This is where hell broke loose.&lt;br /&gt;&lt;br /&gt;First off, our Continuous Integration server was headless (i.e. no screens, monitors, displays, whatever, just a remote console). Surely this wouldn't do because uispec4j requires a window manager to use for emulating the Java GUI. Fair enough, I installed X with the Xfce window manager:&lt;br /&gt;&lt;blockquote&gt;apt-get install xorg xfce4&lt;/blockquote&gt;&lt;br /&gt;Then came the next problem. When starting X a fatal error occurred, telling me that no screens were installed. That seems fairly reasonable, but what the heck should I do about it? I decided to install a VNC server to host a remote screen. This would hopefully rid me of my troubles, since I didn't have the (physical) room for installing a monitor for the damn thing.&lt;br /&gt;&lt;blockquote&gt;apt-get install vncserver&lt;/blockquote&gt;&lt;br /&gt;After configuring the vncserver I tried running my tests... Next obstacle: Telling Java which screen to use. This required setting the DISPLAY environment variable in /etc/profile:&lt;br /&gt;&lt;blockquote&gt;export DISPLAY=:1&lt;/blockquote&gt;&lt;br /&gt;Now came the time for some mind-boggling uispec4j errors. 
I found out that &lt;a href="http://www.uispec4j.org/troubleshooting.html"&gt;uispec4j only works with Motif on linux&lt;/a&gt; so you had to append "-Dawt.toolkit=sun.awt.motif.MToolkit" to your commandline like this:&lt;br /&gt;&lt;blockquote&gt;mvn install -Dawt.toolkit=sun.awt.motif.MToolkit&lt;/blockquote&gt;&lt;br /&gt;every time you need to build the damn thing. Sigh, this wasn't something that my Continous Integration system (&lt;a href="https://hudson.dev.java.net/"&gt;Hudson&lt;/a&gt;) was built for so I started to edit various batchscripts to see if I appended the damn "-Dawt.toolkit=sun.awt.motif.MToolkit" parameter to my containers startup script it would work, but no. Instead I found out that you could set the MAVEN_OPTS environment variable, so I did that in /etc/profile:&lt;br /&gt;&lt;blockquote&gt;export MAVEN_OPTS="-Dawt.toolkit=sun.awt.motif.MToolkit"&lt;/blockquote&gt;&lt;br /&gt;But that didn't work either because Hudson doesn't comply with the damn thing :( I tried to set that "awt.toolkit" system property using some static initializers (which I generally think is a poor, poor, poor thing to do in Java in general), but guess what? &lt;i&gt;Uispec4j is filled with static initializers&lt;/i&gt; as well, so that brought me no guarantees whether or not I was the &lt;i&gt;first&lt;/i&gt; static initializer run. (&lt;span style="font-weight: bold;"&gt;edit:&lt;/span&gt; Apparently I might be wrong in this claim about uispec4j, check out the comments for more details).&lt;br /&gt;&lt;br /&gt;Finally I got a new version of Hudson that had a per-project configuration of MAVEN_OPTS and that did the job. The last issue was actually a JVM issue. I had to change the runtime user of my J2EE container to be the same user that hosts the VNC server instance. If you try to access another users desktop, the JVM turns fatal. 
So don't touch my desktop or you'll get your fingers burnt!&lt;br /&gt;&lt;br /&gt;Ah and a last thing about GUI testing: Make sure to set the Locale in your junit setUp methods or else the unittests won't be portable between computers if they have different languages and you assert on the labels of UI elements.&lt;br /&gt;&lt;br /&gt;I once heard a very wise colleague and fellow developer say:&lt;br /&gt;&lt;blockquote&gt;"You should test functionality and domain models through unittesting and test UI through UI!"&lt;/blockquote&gt;&lt;br /&gt;...&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-7265256971798682063?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/7265256971798682063/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=7265256971798682063' title='3 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/7265256971798682063'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/7265256971798682063'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2008/04/getting-uispec4j-to-run-in-hudson-on.html' title='Getting uispec4j to run in Hudson on a headless linuxbox'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' 
src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>3</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-643490514440646516</id><published>2008-04-17T08:38:00.003+02:00</published><updated>2008-04-17T09:01:29.489+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='dependency'/><category scheme='http://www.blogger.com/atom/ns#' term='data'/><category scheme='http://www.blogger.com/atom/ns#' term='java'/><category scheme='http://www.blogger.com/atom/ns#' term='jar'/><category scheme='http://www.blogger.com/atom/ns#' term='cleaner'/><category scheme='http://www.blogger.com/atom/ns#' term='maven'/><title type='text'>Maven-ing your way around DataCleaner</title><content type='html'>There seems to be quite some frustrations for old ANT-users switching to use Maven so I thought I would make a small post about the main differences and various hacks that are useful to know as a Maven user. The good thing about ANT is that you can always hack your way around a problem and it's quite easy to find the problems that are stopping you. The bad thing is of course that the build-files seem to grow enormously and that you have to enforce your project infrastructure with some kind of common JAR-download area like a FTP or something similar. In contrast Maven focuses not on the build as a process, but more on the content of the build, because in 99% of the times the process of building a java project is pretty much the same, so why not omit the "how" completely and only focus on the "what" of your build? This "what" is configured in the &lt;a href="http://www.eobjects.dk/trac/browser/datacleaner/DataCleaner-core/trunk/pom.xml"&gt;pom.xml&lt;/a&gt; file!&lt;br /&gt;&lt;br /&gt;Admitted, that was not my primary reason for choosing Maven! :) The thing that won me over was of course the dependency handling system which I really love and loathe a bit at the same time. 
What you need to be aware of about the dependencies is this:&lt;br /&gt;&lt;ul&gt;&lt;br /&gt;&lt;li&gt;Maven automagically creates a local repository for all the JARs you use in your projects.&lt;/li&gt;&lt;br /&gt;&lt;li&gt;There's also a &lt;a href="http://www.mvnrepository.com"&gt;central repository&lt;/a&gt; where maven will download the JARs from, if they are not found in the local one.&lt;/li&gt;&lt;br /&gt;&lt;li&gt;If you are working offline or behind a proxy and you need a new JAR you're bound to mess this up :( When Maven can't find its JARs in the central repository or locally it will blacklist them!&lt;/li&gt;&lt;br /&gt;&lt;li&gt;You can however delete the blacklisting by removing (part of) the local repository, it is found in ~/.m2/repository...&lt;br/&gt;&lt;br /&gt;(windows: &lt;i&gt;C:/Documents and Settings/[username]/.m2/repository&lt;/i&gt;&lt;br/&gt;&lt;br /&gt;or linux: &lt;i&gt;/home/[username]/.m2/repository&lt;/i&gt;).&lt;/li&gt;&lt;br /&gt;&lt;/ul&gt;&lt;br /&gt;OK, so that was the background-knowledge you had to know - now for some of the build goals. The most used maven goal is "install", often prefixed with "clean", like this:&lt;br /&gt;&lt;blockquote&gt;&lt;br /&gt;mvn clean install&lt;br /&gt;&lt;/blockquote&gt;&lt;br /&gt;The install goal will build the project, run the unittests, verify that everything worked and then install the resulting JAR/WAR/Whatever into your local repository. This means that you can then use the project as a dependency to another project, smart eh?&lt;br /&gt;And now for some other commonly used goals:&lt;br /&gt;&lt;blockquote&gt;&lt;br /&gt;mvn site&lt;br /&gt;&lt;/blockquote&gt;&lt;br /&gt;Create a nifty &lt;a href="http://www.eobjects.dk/hudson/job/DataCleaner-core/ws/trunk/target/site/index.html"&gt;project site&lt;/a&gt; with all sorts of nice information and reports (javadoc, unittests, codecoverage etc. 
depending on your configuration).&lt;br /&gt;&lt;blockquote&gt;&lt;br /&gt;mvn install -Dmaven.test.skip=true&lt;br /&gt;&lt;/blockquote&gt;&lt;br /&gt;Ah, the "skip test" parameter. I spent a long time figuring that one out. This is handy if you're working with several projects at the same time and you've (consciously) broken the build and want to keep on using the dependency.&lt;br /&gt;&lt;blockquote&gt;&lt;br /&gt;mvn jetty:run&lt;br /&gt;&lt;/blockquote&gt;&lt;br /&gt;My new DataCleaner-webmonitor favourite. This will bring up a &lt;a href="http://www.mortbay.org/"&gt;Jetty container&lt;/a&gt; with DataCleaner-webmonitor running on localhost. This of course requires a little configuration in pom.xml, I'm sure you can figure it out, just find the plugin-elements that has to do with jetty :)&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-643490514440646516?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/643490514440646516/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=643490514440646516' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/643490514440646516'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/643490514440646516'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2008/04/maven-ing-your-way-around-datacleaner.html' title='Maven-ing your way around DataCleaner'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' 
src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-5325006661168055984</id><published>2008-04-16T19:03:00.005+02:00</published><updated>2008-04-17T00:02:41.721+02:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='data'/><category scheme='http://www.blogger.com/atom/ns#' term='java'/><category scheme='http://www.blogger.com/atom/ns#' term='api'/><category scheme='http://www.blogger.com/atom/ns#' term='core'/><category scheme='http://www.blogger.com/atom/ns#' term='cleaner'/><category scheme='http://www.blogger.com/atom/ns#' term='intelligence'/><category scheme='http://www.blogger.com/atom/ns#' term='business'/><title type='text'>Traversing schemas with DataCleaner-core</title><content type='html'>Being a new framework a lot of you guys probably wonder how to use &lt;a href="http://www.eobjects.dk/datacleaner"&gt;DataCleaner &lt;/a&gt;as a Java API. Unlike a lot of other tools around that I've seen in the Business Intelligence domain DataCleaner was built bottom-up from a developers perspective and the User Interface was added on afterward so to use the DataCleaner-core API can be a real pleasure ... (so much for user-orientation, I'll have to elaborate on that another time)&lt;br /&gt;&lt;br /&gt;Let's take a look at how to get the data of a CSV file. 
This will give us the data of the file using the same interfaces as JDBC-databases, excel-files and possibly other data sources.&lt;br /&gt;&lt;blockquote&gt;&lt;br /&gt;ISchemaFactory&amp;lt;File&amp;gt; schemaFactory = new CsvSchemaFactory();&lt;br /&gt;File file = new File("my_file.csv");&lt;br /&gt;ISchema[] schemas = schemaFactory.getSchemas(file);&lt;br /&gt;&lt;/blockquote&gt;&lt;br /&gt;Or in the case of a JDBC connection:&lt;br /&gt;&lt;blockquote&gt;&lt;br /&gt;ISchemaFactory&amp;lt;Connection&amp;gt; schemaFactory = new JdbcSchemaFactory();&lt;br /&gt;Connection connection = DriverManager.getConnection("jdbc:my:database://localhost/foobar");&lt;br /&gt;ISchema[] schemas = schemaFactory.getSchemas(connection);&lt;br /&gt;&lt;/blockquote&gt;&lt;br /&gt;The schemas retrieved here can be accessed in a very natural way and with a &lt;a href="http://www.eobjects.dk/hudson/job/DataCleaner-core/ws/trunk/target/site/apidocs/dk/eobjects/datacleaner/metadata/package-summary.html"&gt;strong domain model&lt;/a&gt;, unlike traversing schemas in JDBC. Here are some examples:&lt;br /&gt;&lt;blockquote&gt;&lt;br /&gt;ITable[] tables = schemas[0].getTables();&lt;br /&gt;IColumn[] columns = tables[0].getColumns();&lt;br /&gt;String columnName = columns[0].getName();&lt;br /&gt;&lt;/blockquote&gt;&lt;br /&gt;Handling the schemas this way serves an obvious purpose. We can now design our profiles, our validation rules etc. in a very uniform way that can be reused across data source types. 
We'll talk about that next time :)&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/1301643147176033927-5325006661168055984?l=kasper.eobjects.org' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/5325006661168055984/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=1301643147176033927&amp;postID=5325006661168055984' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/5325006661168055984'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/posts/default/5325006661168055984'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2008/04/traversing-schemas-with-datacleaner.html' title='Traversing schemas with DataCleaner-core'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:total>0</thr:total></entry></feed>
