<?xml version='1.0' encoding='UTF-8'?><?xml-stylesheet href="http://www.blogger.com/styles/atom.css" type="text/css"?><feed xmlns='http://www.w3.org/2005/Atom' xmlns:openSearch='http://a9.com/-/spec/opensearchrss/1.0/'><id>tag:blogger.com,1999:blog-1301643147176033927.post5302041660818199228..comments</id><updated>2011-12-23T20:55:50.462+01:00</updated><category term='images'/><category term='data integration'/><category term='extraction'/><category term='tools'/><category term='cli'/><category term='standardizing'/><category term='cleaner'/><category term='bug'/><category term='jndi'/><category term='create table'/><category term='development'/><category term='free'/><category term='winfried'/><category term='08'/><category term='community'/><category term='maven'/><category term='recognition'/><category term='proposal'/><category term='api'/><category term='add-on'/><category term='linkedin'/><category term='query'/><category term='motivation'/><category term='academia'/><category term='job'/><category term='heuristics'/><category term='styring'/><category term='tokens'/><category term='extension'/><category term='metaphone'/><category term='write'/><category term='fluent'/><category term='preprocessing'/><category term='2008'/><category term='table'/><category term='type parameter'/><category term='visualization'/><category term='names'/><category term='to'/><category term='java'/><category term='seam'/><category term='core'/><category term='example'/><category term='graphics'/><category term='transformation'/><category term='formatting'/><category term='hierarchy'/><category term='fetch'/><category term='similarity'/><category term='huge'/><category term='django'/><category term='mutable'/><category term='preview'/><category term='data profiling'/><category term='ui'/><category term='read'/><category term='interview'/><category term='push down'/><category term='report'/><category term='text'/><category term='first time right'/><category term='build'/><category 
term='optimization'/><category term='marketing'/><category term='large'/><category term='project'/><category term='open source data quality release announcement datacleaner 1.5.2'/><category term='blogging'/><category term='distinct'/><category term='van'/><category term='compiler'/><category term='exploration'/><category term='svn'/><category term='customer data'/><category term='flattening'/><category term='education'/><category term='thesis'/><category term='tango'/><category term='packaging'/><category term='flush'/><category term='unittest'/><category term='type-casting'/><category term='headless'/><category term='data warehousing'/><category term='openoffice'/><category term='data quality analysis'/><category term='rows'/><category term='architechture'/><category term='interface'/><category term='it'/><category term='excel'/><category term='flow'/><category term='data processing'/><category term='plugin'/><category term='metamodel'/><category term='planning'/><category term='user interface'/><category term='afløsningsopgave'/><category term='transformer'/><category term='regular'/><category term='image'/><category term='dansk'/><category term='scripts'/><category term='repository'/><category term='string analysis'/><category term='days'/><category term='manual'/><category term='promotion'/><category term='screen'/><category term='sas7bdat'/><category term='user experience'/><category term='speed'/><category term='master data management'/><category term='data format'/><category term='transaction'/><category term='pro'/><category term='engine'/><category term='xslt'/><category term='size'/><category term='meeting'/><category term='cube'/><category term='hudson'/><category term='danish'/><category term='human inference'/><category term='regex'/><category term='meta'/><category term='wikipedia'/><category term='weekdays'/><category term='sql'/><category term='identify'/><category term='awards'/><category term='data entry'/><category term='generated'/><category 
term='master'/><category term='analyzerbeans'/><category term='filtering'/><category term='data quality pro'/><category term='discussion'/><category term='standard measures'/><category term='display'/><category term='documentation'/><category term='swing'/><category term='unit'/><category term='profiler'/><category term='udvikling'/><category term='etl'/><category term='annotations'/><category term='finder'/><category term='ftr'/><category term='column'/><category term='functions'/><category term='open source'/><category term='iso date'/><category term='date'/><category term='insert'/><category term='jar'/><category term='presentation'/><category term='holland'/><category term='test'/><category term='choise'/><category term='css'/><category term='datasources'/><category term='intelligence'/><category term='ejb'/><category term='window'/><category term='humaninference'/><category term='tokenize'/><category term='performance'/><category term='group'/><category term='contact data'/><category term='notes'/><category term='multiple'/><category term='acquisition'/><category term='xml'/><category term='business'/><category term='select'/><category term='diy'/><category term='mysql'/><category term='ease of use'/><category term='lightning'/><category term='jdk'/><category term='sourceforge'/><category term='schema'/><category term='keynote'/><category term='look'/><category term='datastore'/><category term='explode'/><category term='date format'/><category term='maven2'/><category term='game'/><category term='olap'/><category term='great'/><category term='filter'/><category term='ear'/><category term='speak'/><category term='resultset'/><category term='frequency'/><category term='dqa'/><category term='book publish pentaho solutions open source business intelligence'/><category term='integration'/><category term='jpa'/><category term='explore'/><category term='persistence'/><category term='lgpl'/><category term='dependency'/><category term='quality'/><category 
term='screenshot'/><category term='result'/><category term='release'/><category term='detail'/><category term='jms'/><category term='verify'/><category term='sas'/><category term='prototype'/><category term='datasource'/><category term='value'/><category term='value distribution'/><category term='jdbc'/><category term='analyzer'/><category term='jfreechart'/><category term='board'/><category term='documents'/><category term='look and feel'/><category term='datacleaner'/><category term='fast'/><category term='benchmark'/><category term='business intelligence'/><category term='conference'/><category term='millions'/><category term='match'/><category term='evolution'/><category term='string'/><category term='n+1'/><category term='implement'/><category term='spreadsheet'/><category term='member'/><category term='python'/><category term='analysis'/><category term='javaone'/><category term='browser'/><category term='enterprise'/><category term='sassyreader'/><category term='domain'/><category term='open'/><category term='age'/><category term='csv'/><category term='rewriting'/><category term='phonetic'/><category term='file'/><category term='matching'/><category term='intranet'/><category term='database'/><category term='uispec4j'/><category term='linux'/><category term='recommendation'/><category term='hibernate'/><category term='dataprofiling'/><category term='data quality'/><category term='process'/><category term='mdx'/><category term='relational'/><category term='convert'/><category term='programming'/><category term='tutorial'/><category term='multithreading'/><category term='name'/><category term='goals'/><category term='expression'/><category term='mapping'/><category term='website'/><category term='chart'/><category term='levenshtein'/><category term='source'/><category term='expressions'/><category term='feature'/><category term='social construction'/><category term='soundex'/><category term='flushing'/><category term='datastores'/><category 
term='generics'/><category term='tool support'/><category term='dataqualitypro'/><category term='icon'/><category term='mondrian'/><category term='features'/><category term='jboss'/><category term='pattern'/><category term='eobjects'/><category term='article'/><category term='command line'/><category term='model'/><category term='data'/><category term='metadata'/><category term='drill'/><category term='profiling'/><category term='distribution'/><category term='profile'/><category term='discovery'/><title type='text'>Comments on kasper's source: Push down query optimization in DataCleaner</title><link rel='http://schemas.google.com/g/2005#feed' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/5302041660818199228/comments/default'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/5302041660818199228/comments/default'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2011/12/push-down-query-optimization-in.html'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image xmlns:gd='http://schemas.google.com/g/2005' rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><generator version='7.00' uri='http://www.blogger.com'>Blogger</generator><openSearch:totalResults>2</openSearch:totalResults><openSearch:startIndex>1</openSearch:startIndex><openSearch:itemsPerPage>25</openSearch:itemsPerPage><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-3184466546280546246</id><published>2011-12-23T20:55:50.462+01:00</published><updated>2011-12-23T20:55:50.462+01:00</updated><title type='text'>Glad you like the approach Matt :) Actually anothe...</title><content type='html'>Glad you like the approach Matt :) Actually another point that I wanted to add is that this optimization will also 
be pushed to non-RDBMS sources, eg. CSV files, since the DC query layer is programmatic, not SQL-based. So the materialization of the actual SQL query or the instructions to the CSV reader happens at a later stage (using MetaModel).&lt;br /&gt;&lt;br /&gt;Obviously not all filters can use this approach. We also have some filters like the JavaScript filter which are completely scriptable by the end user and that is not translatable to a query. In those cases we simply process the filter in the engine.&lt;br /&gt;&lt;br /&gt;You&amp;#39;re right in that oftentimes it&amp;#39;s not even optimal to push transformations to the query. My general rule of thumb is: If it decreases the amount of data being transmitted (I/O), then do the optimization, if not, then do it in the row processing engine instead. I think you have the same strategy in Kettle? That&amp;#39;s why so far we don&amp;#39;t have a &amp;quot;push down&amp;quot; optimization technique for transformers in DC, since these will grow the size (add columns) of the dataset and that will increase I/O between DB and client. 
And besides, DC&amp;#39;s (and Kettle&amp;#39;s) engine is multi-threaded, while most RDBMS&amp;#39;es dispatch every query in a single thread.</content><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/5302041660818199228/comments/default/3184466546280546246'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/5302041660818199228/comments/default/3184466546280546246'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2011/12/push-down-query-optimization-in.html?showComment=1324670150462#c3184466546280546246' title=''/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/02068277922010229877</uri><email>noreply@blogger.com</email><gd:image xmlns:gd='http://schemas.google.com/g/2005' rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:in-reply-to xmlns:thr='http://purl.org/syndication/thread/1.0' href='http://kasper.eobjects.org/2011/12/push-down-query-optimization-in.html' ref='tag:blogger.com,1999:blog-1301643147176033927.post-5302041660818199228' source='http://www.blogger.com/feeds/1301643147176033927/posts/default/5302041660818199228' type='text/html'/><gd:extendedProperty xmlns:gd='http://schemas.google.com/g/2005' name='blogger.itemClass' value='pid-1290105647'/></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-5199937220589366591</id><published>2011-12-23T17:47:22.563+01:00</published><updated>2011-12-23T17:47:22.563+01:00</updated><title type='text'>Nice!  For filters it makes sense in most but not ...</title><content type='html'>Nice!  For filters it makes sense in most but not all cases to filter in the database.  However, making this optional seems like the thing to do since there are a lot of RDBMS situations out there, indexes, network speed and so on need to be considered.  
&lt;br /&gt;The thing is, I have unfortunately seen a lot of cases where the database filtered, summed, grouped and counted slower than the data could be read over JDBC.  That in turn allows tools like DataCleaner and Kettle to work faster on the raw data in those cases... ;-)</content><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/5302041660818199228/comments/default/5199937220589366591'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/5302041660818199228/comments/default/5199937220589366591'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2011/12/push-down-query-optimization-in.html?showComment=1324658842563#c5199937220589366591' title=''/><author><name>Matt Casters</name><uri>http://www.blogger.com/profile/12263548900215476529</uri><email>noreply@blogger.com</email><gd:image xmlns:gd='http://schemas.google.com/g/2005' rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='24' src='http://bp2.blogger.com/_cHg9rXowGtw/R6DPxIh4iEI/AAAAAAAAAAQ/a2PKY2431ys/S220/MattGardenSmall.png'/></author><thr:in-reply-to xmlns:thr='http://purl.org/syndication/thread/1.0' href='http://kasper.eobjects.org/2011/12/push-down-query-optimization-in.html' ref='tag:blogger.com,1999:blog-1301643147176033927.post-5302041660818199228' source='http://www.blogger.com/feeds/1301643147176033927/posts/default/5302041660818199228' type='text/html'/><gd:extendedProperty xmlns:gd='http://schemas.google.com/g/2005' name='blogger.itemClass' value='pid-2035751120'/></entry></feed>
