{ "metadata": { "name": "", "signature": "sha256:dff3e12b7160018f22a5c30b60c2d94c682bb83a1b63ac2fa5c5764fd2d6f28d" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "# Blaze - Symbolic Data Analysis\n", "\n", "We demonstrate Blaze's symbolic nature by comparing its use to Pandas." ] }, { "cell_type": "code", "collapsed": false, "input": [ "import blaze as bz\n", "import pandas as pd" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "bz.__version__" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 2, "text": [ "'0.6.5'" ] } ], "prompt_number": 2 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### On The Surface\n", "\n", "Pandas and Blaze feel similar" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df = pd.read_csv('/home/mrocklin/workspace/blaze/examples/data/iris.csv')\n", "df.head(10)" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_widthspecies
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa
5 5.4 3.9 1.7 0.4 Iris-setosa
6 4.6 3.4 1.4 0.3 Iris-setosa
7 5.0 3.4 1.5 0.2 Iris-setosa
8 4.4 2.9 1.4 0.2 Iris-setosa
9 4.9 3.1 1.5 0.1 Iris-setosa
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 3, "text": [ " sepal_length sepal_width petal_length petal_width species\n", "0 5.1 3.5 1.4 0.2 Iris-setosa\n", "1 4.9 3.0 1.4 0.2 Iris-setosa\n", "2 4.7 3.2 1.3 0.2 Iris-setosa\n", "3 4.6 3.1 1.5 0.2 Iris-setosa\n", "4 5.0 3.6 1.4 0.2 Iris-setosa\n", "5 5.4 3.9 1.7 0.4 Iris-setosa\n", "6 4.6 3.4 1.4 0.3 Iris-setosa\n", "7 5.0 3.4 1.5 0.2 Iris-setosa\n", "8 4.4 2.9 1.4 0.2 Iris-setosa\n", "9 4.9 3.1 1.5 0.1 Iris-setosa" ] } ], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "t = bz.Table('/home/mrocklin/workspace/blaze/examples/data/iris.csv')\n", "t.head(10)" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_widthspecies
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa
5 5.4 3.9 1.7 0.4 Iris-setosa
6 4.6 3.4 1.4 0.3 Iris-setosa
7 5.0 3.4 1.5 0.2 Iris-setosa
8 4.4 2.9 1.4 0.2 Iris-setosa
9 4.9 3.1 1.5 0.1 Iris-setosa
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 4, "text": [ " sepal_length sepal_width petal_length petal_width species\n", "0 5.1 3.5 1.4 0.2 Iris-setosa\n", "1 4.9 3.0 1.4 0.2 Iris-setosa\n", "2 4.7 3.2 1.3 0.2 Iris-setosa\n", "3 4.6 3.1 1.5 0.2 Iris-setosa\n", "4 5.0 3.6 1.4 0.2 Iris-setosa\n", "5 5.4 3.9 1.7 0.4 Iris-setosa\n", "6 4.6 3.4 1.4 0.3 Iris-setosa\n", "7 5.0 3.4 1.5 0.2 Iris-setosa\n", "8 4.4 2.9 1.4 0.2 Iris-setosa\n", "9 4.9 3.1 1.5 0.1 Iris-setosa" ] } ], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "t.species.distinct()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
species
0 Iris-setosa
1 Iris-versicolor
2 Iris-virginica
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 5, "text": [ " species\n", "0 Iris-setosa\n", "1 Iris-versicolor\n", "2 Iris-virginica" ] } ], "prompt_number": 5 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Both perform analytic queries\n", "\n", "For example, lets compute the minimum sepal length per species" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df.groupby('species').sepal_length.min()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 6, "text": [ "species\n", "Iris-setosa 4.3\n", "Iris-versicolor 4.9\n", "Iris-virginica 4.9\n", "Name: sepal_length, dtype: float64" ] } ], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [ "bz.by(t.species, \n", " smallest=t.sepal_length.min())" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
speciessmallest
0 Iris-virginica 4.9
1 Iris-setosa 4.3
2 Iris-versicolor 4.9
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 7, "text": [ " species smallest\n", "0 Iris-virginica 4.9\n", "1 Iris-setosa 4.3\n", "2 Iris-versicolor 4.9" ] } ], "prompt_number": 7 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### But internally they're quite different\n", "\n", "As an example, lets see how they handle interaction with SQL databases" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### SQL Interaction" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# Open up SQLAlchemy engine\n", "import sqlalchemy\n", "engine = sqlalchemy.create_engine('sqlite:////home/mrocklin/workspace/blaze/examples/data/iris.db')\n", "\n", "# Pull data from SQLite to Pandas\n", "df = pd.read_sql('SELECT * FROM iris', engine)\n", "df.head(10)" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_widthspecies
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa
5 5.4 3.9 1.7 0.4 Iris-setosa
6 4.6 3.4 1.4 0.3 Iris-setosa
7 5.0 3.4 1.5 0.2 Iris-setosa
8 4.4 2.9 1.4 0.2 Iris-setosa
9 4.9 3.1 1.5 0.1 Iris-setosa
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 8, "text": [ " sepal_length sepal_width petal_length petal_width species\n", "0 5.1 3.5 1.4 0.2 Iris-setosa\n", "1 4.9 3.0 1.4 0.2 Iris-setosa\n", "2 4.7 3.2 1.3 0.2 Iris-setosa\n", "3 4.6 3.1 1.5 0.2 Iris-setosa\n", "4 5.0 3.6 1.4 0.2 Iris-setosa\n", "5 5.4 3.9 1.7 0.4 Iris-setosa\n", "6 4.6 3.4 1.4 0.3 Iris-setosa\n", "7 5.0 3.4 1.5 0.2 Iris-setosa\n", "8 4.4 2.9 1.4 0.2 Iris-setosa\n", "9 4.9 3.1 1.5 0.1 Iris-setosa" ] } ], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "# Point Blaze to SQLite table\n", "t = bz.Table('sqlite:////home/mrocklin/workspace/blaze/examples/data/iris.db::iris')\n", "t.head(10)" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_widthspecies
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa
5 5.4 3.9 1.7 0.4 Iris-setosa
6 4.6 3.4 1.4 0.3 Iris-setosa
7 5.0 3.4 1.5 0.2 Iris-setosa
8 4.4 2.9 1.4 0.2 Iris-setosa
9 4.9 3.1 1.5 0.1 Iris-setosa
10 5.4 3.7 1.5 0.2 Iris-setosa
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 9, "text": [ " sepal_length sepal_width petal_length petal_width species\n", "0 5.1 3.5 1.4 0.2 Iris-setosa\n", "1 4.9 3.0 1.4 0.2 Iris-setosa\n", "2 4.7 3.2 1.3 0.2 Iris-setosa\n", "3 4.6 3.1 1.5 0.2 Iris-setosa\n", "4 5.0 3.6 1.4 0.2 Iris-setosa\n", "5 5.4 3.9 1.7 0.4 Iris-setosa\n", "6 4.6 3.4 1.4 0.3 Iris-setosa\n", "7 5.0 3.4 1.5 0.2 Iris-setosa\n", "8 4.4 2.9 1.4 0.2 Iris-setosa\n", "9 4.9 3.1 1.5 0.1 Iris-setosa\n", "..." ] } ], "prompt_number": 9 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### And again our analytic query" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df.groupby('species').sepal_length.min()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 10, "text": [ "species\n", "Iris-setosa 4.3\n", "Iris-versicolor 4.9\n", "Iris-virginica 4.9\n", "Name: sepal_length, dtype: float64" ] } ], "prompt_number": 10 }, { "cell_type": "code", "collapsed": false, "input": [ "bz.by(t.species, \n", " smallest=t.sepal_length.min())" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
speciessmallest
0 Iris-setosa 4.3
1 Iris-versicolor 4.9
2 Iris-virginica 4.9
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 11, "text": [ " species smallest\n", "0 Iris-setosa 4.3\n", "1 Iris-versicolor 4.9\n", "2 Iris-virginica 4.9" ] } ], "prompt_number": 11 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Externally things looks the same" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Internally very different things happened\n", "\n", "Pandas pulled data from the SQLite database into local memory, then used pandas algorithms on that data.\n", "\n", "1. Pandas $\\leftarrow_\\textrm{data}$ SQLite\n", "2. Pandas did the work\n", "\n", "Blaze generated SQL and passed that back to the database\n", "\n", "1. Blaze $\\leftarrow_\\textrm{metadata}$ SQLite\n", "2. Blaze $\\rightarrow_\\textrm{SQL}$ SQLite\n", "3. SQLite did the work" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Lets see the generated SQL\n", "\n", "Blaze speaks SQLAlchemy" ] }, { "cell_type": "code", "collapsed": false, "input": [ "expr = bz.by(t.species, \n", " smallest=t.sepal_length.min())\n", "\n", "result = bz.compute(expr, {t: t.data.table})\n", "result" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 12, "text": [ "" ] } ], "prompt_number": 12 }, { "cell_type": "markdown", "metadata": {}, "source": [ "SQLAlchemy in turn speaks SQLite" ] }, { "cell_type": "code", "collapsed": false, "input": [ "print result" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "SELECT iris.species, min(iris.sepal_length) AS smallest \n", "FROM iris GROUP BY iris.species\n" ] } ], "prompt_number": 13 }, { "cell_type": "markdown", "metadata": {}, "source": [ "A more complex example in SQLAlchemy" ] }, { "cell_type": "code", "collapsed": false, "input": [ "print bz.compute(t[t.sepal_length > 5].species.count_values(), \n", " {t: t.data.table})" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "SELECT iris.species, count(iris.species) AS count \n", "FROM iris \n", "WHERE iris.sepal_length > :sepal_length_1 GROUP BY iris.species ORDER BY count DESC\n" ] } ], "prompt_number": 14 }, { "cell_type": "markdown", "metadata": {}, "source": [ "What users actually experience" ] }, { "cell_type": "code", "collapsed": false, "input": [ "t[t.sepal_length > 5].species.count_values()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
speciescount
0 Iris-virginica 49
1 Iris-versicolor 47
2 Iris-setosa 22
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 15, "text": [ " species count\n", "0 Iris-virginica 49\n", "1 Iris-versicolor 47\n", "2 Iris-setosa 22" ] } ], "prompt_number": 15 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Main Points\n", "\n", "1. Blaze feels like NumPy/Pandas\n", "2. Today, Blaze doesn't perform computation\n", "3. Instead, Blaze manages metadata and APIs" ] } ], "metadata": {} } ] }