<?xml version='1.0' encoding='UTF-8'?><?xml-stylesheet href="http://www.blogger.com/styles/atom.css" type="text/css"?><feed xmlns='http://www.w3.org/2005/Atom' xmlns:openSearch='http://a9.com/-/spec/opensearchrss/1.0/'><id>tag:blogger.com,1999:blog-1301643147176033927.post2749725609255149078..comments</id><updated>2011-08-19T13:59:33.664+02:00</updated><category term='images'/><category term='data integration'/><category term='extraction'/><category term='tools'/><category term='cli'/><category term='standardizing'/><category term='cleaner'/><category term='bug'/><category term='jndi'/><category term='create table'/><category term='development'/><category term='free'/><category term='winfried'/><category term='08'/><category term='community'/><category term='maven'/><category term='recognition'/><category term='proposal'/><category term='api'/><category term='add-on'/><category term='linkedin'/><category term='query'/><category term='motivation'/><category term='academia'/><category term='job'/><category term='heuristics'/><category term='styring'/><category term='tokens'/><category term='extension'/><category term='metaphone'/><category term='write'/><category term='fluent'/><category term='preprocessing'/><category term='2008'/><category term='table'/><category term='type parameter'/><category term='visualization'/><category term='names'/><category term='to'/><category term='java'/><category term='seam'/><category term='core'/><category term='example'/><category term='graphics'/><category term='transformation'/><category term='formatting'/><category term='hierarchy'/><category term='fetch'/><category term='similarity'/><category term='huge'/><category term='django'/><category term='mutable'/><category term='preview'/><category term='data profiling'/><category term='ui'/><category term='read'/><category term='interview'/><category term='push down'/><category term='report'/><category term='text'/><category term='first time right'/><category term='build'/><category term='optimization'/><category term='marketing'/><category term='large'/><category term='project'/><category term='open source data quality release announcement datacleaner 1.5.2'/><category term='blogging'/><category term='distinct'/><category term='van'/><category term='compiler'/><category term='exploration'/><category term='svn'/><category term='customer data'/><category term='flattening'/><category term='education'/><category term='thesis'/><category term='tango'/><category term='packaging'/><category term='flush'/><category term='unittest'/><category term='type-casting'/><category term='headless'/><category term='data warehousing'/><category term='openoffice'/><category term='data quality analysis'/><category term='rows'/><category term='architechture'/><category term='interface'/><category term='it'/><category term='excel'/><category term='flow'/><category term='data processing'/><category term='plugin'/><category term='metamodel'/><category term='planning'/><category term='user interface'/><category term='afløsningsopgave'/><category term='transformer'/><category term='regular'/><category term='image'/><category term='dansk'/><category term='scripts'/><category term='repository'/><category term='string analysis'/><category term='days'/><category term='manual'/><category term='promotion'/><category term='screen'/><category term='sas7bdat'/><category term='user experience'/><category term='speed'/><category term='master data management'/><category term='data format'/><category term='transaction'/><category term='pro'/><category term='engine'/><category term='xslt'/><category term='size'/><category term='meeting'/><category term='cube'/><category term='hudson'/><category term='danish'/><category term='human inference'/><category term='regex'/><category term='meta'/><category term='wikipedia'/><category term='weekdays'/><category term='sql'/><category term='identify'/><category term='awards'/><category term='data entry'/><category term='generated'/><category term='master'/><category term='analyzerbeans'/><category term='filtering'/><category term='data quality pro'/><category term='discussion'/><category term='standard measures'/><category term='display'/><category term='documentation'/><category term='swing'/><category term='unit'/><category term='profiler'/><category term='udvikling'/><category term='etl'/><category term='annotations'/><category term='finder'/><category term='ftr'/><category term='column'/><category term='functions'/><category term='open source'/><category term='iso date'/><category term='date'/><category term='insert'/><category term='jar'/><category term='presentation'/><category term='holland'/><category term='test'/><category term='choise'/><category term='css'/><category term='datasources'/><category term='intelligence'/><category term='ejb'/><category term='window'/><category term='humaninference'/><category term='tokenize'/><category term='performance'/><category term='group'/><category term='contact data'/><category term='notes'/><category term='multiple'/><category term='acquisition'/><category term='xml'/><category term='business'/><category term='select'/><category term='diy'/><category term='mysql'/><category term='ease of use'/><category term='lightning'/><category term='jdk'/><category term='sourceforge'/><category term='schema'/><category term='keynote'/><category term='look'/><category term='datastore'/><category term='explode'/><category term='date format'/><category term='maven2'/><category term='game'/><category term='olap'/><category term='great'/><category term='filter'/><category term='ear'/><category term='speak'/><category term='resultset'/><category term='frequency'/><category term='dqa'/><category term='book publish pentaho solutions open source business intelligence'/><category term='integration'/><category term='jpa'/><category term='explore'/><category term='persistence'/><category term='lgpl'/><category term='dependency'/><category term='quality'/><category term='screenshot'/><category term='result'/><category term='release'/><category term='detail'/><category term='jms'/><category term='verify'/><category term='sas'/><category term='prototype'/><category term='datasource'/><category term='value'/><category term='value distribution'/><category term='jdbc'/><category term='analyzer'/><category term='jfreechart'/><category term='board'/><category term='documents'/><category term='look and feel'/><category term='datacleaner'/><category term='fast'/><category term='benchmark'/><category term='business intelligence'/><category term='conference'/><category term='millions'/><category term='match'/><category term='evolution'/><category term='string'/><category term='n+1'/><category term='implement'/><category term='spreadsheet'/><category term='member'/><category term='python'/><category term='analysis'/><category term='javaone'/><category term='browser'/><category term='enterprise'/><category term='sassyreader'/><category term='domain'/><category term='open'/><category term='age'/><category term='csv'/><category term='rewriting'/><category term='phonetic'/><category term='file'/><category term='matching'/><category term='intranet'/><category term='database'/><category term='uispec4j'/><category term='linux'/><category term='recommendation'/><category term='hibernate'/><category term='dataprofiling'/><category term='data quality'/><category term='process'/><category term='mdx'/><category term='relational'/><category term='convert'/><category term='programming'/><category term='tutorial'/><category term='multithreading'/><category term='name'/><category term='goals'/><category term='expression'/><category term='mapping'/><category term='website'/><category term='chart'/><category term='levenshtein'/><category term='source'/><category term='expressions'/><category term='feature'/><category term='social construction'/><category term='soundex'/><category term='flushing'/><category term='datastores'/><category term='generics'/><category term='tool support'/><category term='dataqualitypro'/><category term='icon'/><category term='mondrian'/><category term='features'/><category term='jboss'/><category term='pattern'/><category term='eobjects'/><category term='article'/><category term='command line'/><category term='model'/><category term='data'/><category term='metadata'/><category term='drill'/><category term='profiling'/><category term='distribution'/><category term='profile'/><category term='discovery'/><title type='text'>Comments on kasper's source: Pre-processing in DataCleaner 2: Why?</title><link rel='http://schemas.google.com/g/2005#feed' type='application/atom+xml' href='http://kasper.eobjects.org/feeds/2749725609255149078/comments/default'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/2749725609255149078/comments/default'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2011/02/pre-processing-in-datacleaner-2-why.html'/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image xmlns:gd='http://schemas.google.com/g/2005' rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><generator version='7.00' uri='http://www.blogger.com'>Blogger</generator><openSearch:totalResults>5</openSearch:totalResults><openSearch:startIndex>1</openSearch:startIndex><openSearch:itemsPerPage>25</openSearch:itemsPerPage><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-5398504608091568008</id><published>2011-08-19T13:59:33.664+02:00</published><updated>2011-08-19T13:59:33.664+02:00</updated><title type='text'>For every kind of business it needs to be updated ...</title><content type='html'>For every kind of business it needs to be updated by using new software solutions. Some applications could be developed by outsource company that provides &lt;a href="http://sigmaukraine.Com/services/custom-software-development" rel="nofollow"&gt;custom software&lt;/a&gt; for business companies.</content><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/2749725609255149078/comments/default/5398504608091568008'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/2749725609255149078/comments/default/5398504608091568008'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2011/02/pre-processing-in-datacleaner-2-why.html?showComment=1313755173664#c5398504608091568008' title=''/><author><name>Joshua Smith</name><uri>http://www.blogger.com/profile/09956978234577871712</uri><email>noreply@blogger.com</email><gd:image xmlns:gd='http://schemas.google.com/g/2005' rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:in-reply-to xmlns:thr='http://purl.org/syndication/thread/1.0' href='http://kasper.eobjects.org/2011/02/pre-processing-in-datacleaner-2-why.html' ref='tag:blogger.com,1999:blog-1301643147176033927.post-2749725609255149078' source='http://www.blogger.com/feeds/1301643147176033927/posts/default/2749725609255149078' type='text/html'/><gd:extendedProperty xmlns:gd='http://schemas.google.com/g/2005' name='blogger.itemClass' value='pid-512861786'/></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-321660676436239834</id><published>2011-03-17T14:30:04.347+01:00</published><updated>2011-03-17T14:30:04.347+01:00</updated><title type='text'>But honestly - any tool can do a SQL query. You ca...</title><content type='html'>But honestly - any tool can do a SQL query. You can invoke it using any command line tool that follows with your DB.&lt;br /&gt;&lt;br /&gt;And it&amp;#39;s not entirely true that SQL is always faster than Java. If you&amp;#39;re only looking for a single measure, then yes, it typically is. But if you want to retrieve a lot of measures from the same set of data, then it is typically faster to query once and share the result between multiple measure generators.&lt;br /&gt;&lt;br /&gt;Furthermore, SQL doesn&amp;#39;t work on anything but relational databases... I find that doing a DQA most often requires incorporating a lot of data from different stores.&lt;br /&gt;&lt;br /&gt;Monitoring is something completely different and whatever measures found there should be handled as temporal data stored for trends analysis etc. I don&amp;#39;t think a profiling tool will be the obvious choice for that.</content><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/2749725609255149078/comments/default/321660676436239834'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/2749725609255149078/comments/default/321660676436239834'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2011/02/pre-processing-in-datacleaner-2-why.html?showComment=1300368604347#c321660676436239834' title=''/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image xmlns:gd='http://schemas.google.com/g/2005' rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:in-reply-to xmlns:thr='http://purl.org/syndication/thread/1.0' href='http://kasper.eobjects.org/2011/02/pre-processing-in-datacleaner-2-why.html' ref='tag:blogger.com,1999:blog-1301643147176033927.post-2749725609255149078' source='http://www.blogger.com/feeds/1301643147176033927/posts/default/2749725609255149078' type='text/html'/><gd:extendedProperty xmlns:gd='http://schemas.google.com/g/2005' name='blogger.itemClass' value='pid-1572147945'/></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-2318018910543298687</id><published>2011-03-17T11:58:25.034+01:00</published><updated>2011-03-17T11:58:25.034+01:00</updated><title type='text'>rhaa, I lost my previous comment. So here is a sho...</title><content type='html'>rhaa, I lost my previous comment. So here is a shorter one:&lt;br /&gt;&lt;br /&gt;I just wanted to say that SQL queries are faster than a java program in general. And that they are useful when you need to monitor your data quality on a regular basis (Do I have some nulls data today?). &lt;br /&gt;&lt;br /&gt;On the other hand, complex analyses will require the data to be extracted from the datastore. In this case, SQL queries cannot be used. &lt;br /&gt;&lt;br /&gt;@Garima, give a try at Talend Open Profiler which provides both types of analysis.</content><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/2749725609255149078/comments/default/2318018910543298687'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/2749725609255149078/comments/default/2318018910543298687'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2011/02/pre-processing-in-datacleaner-2-why.html?showComment=1300359505034#c2318018910543298687' title=''/><author><name>scorreiait</name><uri>http://scorreiait.wordpress.com/</uri><email>noreply@blogger.com</email><gd:image xmlns:gd='http://schemas.google.com/g/2005' rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img1.blogblog.com/img/openid16-rounded.gif'/></author><thr:in-reply-to xmlns:thr='http://purl.org/syndication/thread/1.0' href='http://kasper.eobjects.org/2011/02/pre-processing-in-datacleaner-2-why.html' ref='tag:blogger.com,1999:blog-1301643147176033927.post-2749725609255149078' source='http://www.blogger.com/feeds/1301643147176033927/posts/default/2749725609255149078' type='text/html'/><gd:extendedProperty xmlns:gd='http://schemas.google.com/g/2005' name='blogger.itemClass' value='pid-2104128081'/></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-5552873077021655462</id><published>2011-03-02T09:09:01.249+01:00</published><updated>2011-03-02T09:09:01.249+01:00</updated><title type='text'>Currently there is not a SQL console or something ...</title><content type='html'>Currently there is not a SQL console or something similar in DataCleaner. This was a quite conscious choice, because we want to provide something that will work the same for any database, or rather - datastore! A datastore might simply be a CSV file, or an Excel spreadsheet, so making a SQL console would be something only available for some of the datastore types.&lt;br /&gt;&lt;br /&gt;The other thing about doing SQL is that I don&amp;#39;t think it&amp;#39;s very good for exploration. In DataCleaner there are a lot of filters and transformation steps that you can add, and you can choose to preview and even subset your data (&amp;quot;Write to datastore&amp;quot;) after you&amp;#39;ve performed these steps. In my oppinion that&amp;#39;s a better way to do exploration.&lt;br /&gt;&lt;br /&gt;I&amp;#39;m not against making something for user-written queries, but it&amp;#39;s just not on the top of my radar screen. If someone wants to work on such a feature I think it&amp;#39;s great and I will do my best to help them out.&lt;br /&gt;&lt;br /&gt;Regarding your question about associated columns, the answer is that you need to include these columns in some kind of analysis. It could be a simple &amp;quot;String analyzer&amp;quot; or something like that. The reason is that the rows that are saved for inspection by the user are based on the rows that are available while DataCleaner is executing. At the same time, DataCleaner will only query for the data it needs. So this means if you need more data (columns) to be available, then you should make sure these data are included in the analysis.</content><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/2749725609255149078/comments/default/5552873077021655462'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/2749725609255149078/comments/default/5552873077021655462'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2011/02/pre-processing-in-datacleaner-2-why.html?showComment=1299053341249#c5552873077021655462' title=''/><author><name>Kasper Sørensen</name><uri>http://www.blogger.com/profile/05310806961885711209</uri><email>noreply@blogger.com</email><gd:image xmlns:gd='http://schemas.google.com/g/2005' rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='32' src='http://www.gravatar.com/avatar.php?gravatar_id=7dbffa8e2b37fb2f271faa3f2bec7215'/></author><thr:in-reply-to xmlns:thr='http://purl.org/syndication/thread/1.0' href='http://kasper.eobjects.org/2011/02/pre-processing-in-datacleaner-2-why.html' ref='tag:blogger.com,1999:blog-1301643147176033927.post-2749725609255149078' source='http://www.blogger.com/feeds/1301643147176033927/posts/default/2749725609255149078' type='text/html'/><gd:extendedProperty xmlns:gd='http://schemas.google.com/g/2005' name='blogger.itemClass' value='pid-1572147945'/></entry><entry><id>tag:blogger.com,1999:blog-1301643147176033927.post-6875890137125488592</id><published>2011-03-02T08:21:51.344+01:00</published><updated>2011-03-02T08:21:51.344+01:00</updated><title type='text'>Does DalaCleaner support SQL queries? How can I is...</title><content type='html'>Does DalaCleaner support SQL queries? How can I issue SQL queries to explore data? Should it not be a part of a data analysis/profiling tool to allow to explore data?&lt;br /&gt;&lt;br /&gt;I can generate various profiles to explore data, but in the output, I can see only the column on which profiling was done. What should I do to see other associated column values too for that row?</content><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/2749725609255149078/comments/default/6875890137125488592'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/1301643147176033927/2749725609255149078/comments/default/6875890137125488592'/><link rel='alternate' type='text/html' href='http://kasper.eobjects.org/2011/02/pre-processing-in-datacleaner-2-why.html?showComment=1299050511344#c6875890137125488592' title=''/><author><name>Garima</name><uri>http://www.blogger.com/profile/08167520136802547297</uri><email>noreply@blogger.com</email><gd:image xmlns:gd='http://schemas.google.com/g/2005' rel='http://schemas.google.com/g/2005#thumbnail' width='16' height='16' src='http://img2.blogblog.com/img/b16-rounded.gif'/></author><thr:in-reply-to xmlns:thr='http://purl.org/syndication/thread/1.0' href='http://kasper.eobjects.org/2011/02/pre-processing-in-datacleaner-2-why.html' ref='tag:blogger.com,1999:blog-1301643147176033927.post-2749725609255149078' source='http://www.blogger.com/feeds/1301643147176033927/posts/default/2749725609255149078' type='text/html'/><gd:extendedProperty xmlns:gd='http://schemas.google.com/g/2005' name='blogger.itemClass' value='pid-271112042'/></entry></feed>
