|
72 | 72 | "The following data sets need to be processed:\n", |
73 | 73 | "- Daily weather data (daily_weather.parquet)\n", |
74 | 74 | "- Cities (cities.csv)\n", |
75 | | - "- Countries (countries.csv)" |
| 75 | + "- Countries (countries.csv)\n", |
| 76 | + "\n", |
| 77 | + "The subsequent code cell acquires the dataset directly from kaggle.com.\n", |
| 78 | + "After signing up on Kaggle, configure the notebook to use your\n", |
| 79 | + "Kaggle API credentials by defining the `KAGGLE_USERNAME` and\n", |
| 80 | + "`KAGGLE_KEY` environment variables. Alternatively, put them into the\n", |
| 81 | + "file `~/.kaggle/kaggle.json` in your home folder, like this:\n", |
| 82 | + "```json\n", |
| 83 | + "{\n", |
| 84 | + " \"username\": \"acme\",\n", |
| 85 | + " \"key\": \"2b1dac2af55caaf1f34df76236fada4a\"\n", |
| 86 | + "}\n", |
| 87 | + "```\n", |
| 88 | + "Another variant is to acquire the dataset files manually, and extract\n", |
| 89 | + "them into a folder called `DOWNLOAD`. In this case, you can comment out\n", |
| 90 | + "the `load_dataset(...)` and `dataset.acquire()` lines in the next code cell, in order to skip automatic dataset acquisition." |
76 | 91 | ] |
77 | 92 | }, |
| 93 | + { |
| 94 | + "cell_type": "code", |
| 95 | + "execution_count": null, |
| 96 | + "outputs": [], |
| 97 | + "source": [ |
| 98 | + "from cratedb_toolkit.datasets import load_dataset\n", |
| 99 | + "\n", |
| 100 | + "dataset = load_dataset(\"kaggle://guillemservera/global-daily-climate-data/daily_weather.parquet\")\n", |
| 101 | + "dataset.acquire()" |
| 102 | + ], |
| 103 | + "metadata": { |
| 104 | + "collapsed": false |
| 105 | + } |
| 106 | + }, |
78 | 107 | { |
79 | 108 | "cell_type": "code", |
80 | 109 | "execution_count": 88, |
81 | | - "id": "fa24e753", |
82 | | - "metadata": {}, |
83 | 110 | "outputs": [], |
84 | 111 | "source": [ |
85 | 112 | "from dask import dataframe as dd\n", |
|
88 | 115 | "# Show a progress bar for dask activities\n", |
89 | 116 | "pbar = ProgressBar()\n", |
90 | 117 | "pbar.register()" |
91 | | - ] |
| 118 | + ], |
| 119 | + "metadata": { |
| 120 | + "collapsed": false |
| 121 | + } |
92 | 122 | }, |
93 | 123 | { |
94 | 124 | "cell_type": "code", |
|
288 | 318 | ], |
289 | 319 | "source": [ |
290 | 320 | "# Load the parquet file. Please adjust the file path as needed.\n", |
291 | | - "df_kaggle = dd.read_parquet('DOWNLOAD_PATH/daily_weather.parquet')\n", |
| 321 | + "df_kaggle = dd.read_parquet('DOWNLOAD/daily_weather.parquet')\n", |
292 | 322 | "\n", |
293 | 323 | "# Show info about the data.\n", |
294 | 324 | "df_kaggle.info(verbose=True, memory_usage=True)\n", |
|
421 | 451 | ], |
422 | 452 | "source": [ |
423 | 453 | "# Read cities, adapt the path to the files accordingly\n", |
424 | | - "cities = dd.read_csv(\"DOWNLOAD_PATH/cities.csv\",dtype={'station_id': 'object'})\n", |
| 454 | + "cities = dd.read_csv(\"DOWNLOAD/cities.csv\",dtype={'station_id': 'object'})\n", |
425 | 455 | "\n", |
426 | 456 | "# Modify lon and lat of cities into an array that can be interpreted directly by CrateDB\n", |
427 | 457 | "def create_location_column(df):\n", |
|
442 | 472 | "outputs": [], |
443 | 473 | "source": [ |
444 | 474 | "# Read countries, adapt the path to the files accordingly\n", |
445 | | - "countries = dd.read_csv(\"DOWNLOAD_PATH/countries.csv\")" |
| 475 | + "countries = dd.read_csv(\"DOWNLOAD/countries.csv\")" |
446 | 476 | ] |
447 | 477 | }, |
448 | 478 | { |
|
476 | 506 | "metadata": {}, |
477 | 507 | "outputs": [], |
478 | 508 | "source": [ |
| 509 | + "import os\n", |
479 | 510 | "import sqlalchemy as sa\n", |
480 | 511 | "from crate.client.sqlalchemy.support import insert_bulk\n", |
481 | 512 | "\n", |
482 | | - "# Connect to CrateDB\n", |
483 | | - "# For a database running in the cloud, please use a connection string like this:\n", |
484 | | - "dburi = 'crate://USER:PASSWORD@HOST:4200?ssl=true'\n", |
| 513 | + "# Define database address when using CrateDB Cloud.\n", |
| 514 | + "# Please find these settings on your cluster overview page.\n", |
| 515 | + "CONNECTION_STRING = os.environ.get(\n", |
| 516 | + " \"CRATEDB_CONNECTION_STRING\",\n", |
| 517 | + " \"crate://<USER>:<PASSWORD>@<CRATEDB_HOST>/?ssl=true\",\n", |
| 518 | + ")\n", |
485 | 519 | "\n", |
486 | | - "# For a database running locally, please use the following connection string:\n", |
487 | | - "# dburi = 'crate://localhost:4200?ssl=false'\n", |
| 520 | + "# Define database address when using CrateDB on localhost.\n", |
| 521 | + "#CONNECTION_STRING = os.environ.get(\n", |
| 522 | + "# \"CRATEDB_CONNECTION_STRING\",\n", |
| 523 | + "# \"crate://crate@localhost/\",\n", |
| 524 | + "#)\n", |
488 | 525 | "\n", |
489 | | - "engine = sa.create_engine(dburi, echo=False)\n", |
| 526 | + "# Connect to CrateDB using SQLAlchemy.\n", |
| 527 | + "engine = sa.create_engine(CONNECTION_STRING, echo=False)\n", |
490 | 528 | "connection = engine.connect()" |
491 | 529 | ] |
492 | 530 | }, |
|
520 | 558 | ], |
521 | 559 | "source": [ |
522 | 560 | "connection.execute(sa.text(\"\"\"\n", |
523 | | - "CREATE TABLE IF NOT EXISTS \"doc\".\"weather_data\" (\n", |
| 561 | + "CREATE TABLE IF NOT EXISTS \"weather_data\" (\n", |
524 | 562 | " \"station_id\" TEXT,\n", |
525 | 563 | " \"city_name\" TEXT,\n", |
526 | 564 | " \"date\" TIMESTAMP WITHOUT TIME ZONE,\n", |
|
567 | 605 | ], |
568 | 606 | "source": [ |
569 | 607 | "connection.execute(sa.text(\"\"\"\n", |
570 | | - "CREATE TABLE \"doc\".\"cities\" (\n", |
| 608 | + "CREATE TABLE \"cities\" (\n", |
571 | 609 | " \"station_id\" TEXT,\n", |
572 | 610 | " \"city_name\" TEXT,\n", |
573 | 611 | " \"country\" TEXT,\n", |
|
626 | 664 | "# Uncomment the following lines to process the actual weather data.\n", |
627 | 665 | "# They have been disabled in order to avoid long-running operations.\n", |
628 | 666 | "# df_kaggle = df_kaggle.repartition(26)\n", |
629 | | - "# df_kaggle.to_sql(name='weather_data', uri=dburi, schema='doc', if_exists='append', \n", |
| 667 | + "# df_kaggle.to_sql(name='weather_data', uri=CONNECTION_STRING, if_exists='append',\n", |
630 | 668 | "# index=False, chunksize=10000, parallel=True, method=insert_bulk)" |
631 | 669 | ] |
632 | 670 | }, |
|
659 | 697 | } |
660 | 698 | ], |
661 | 699 | "source": [ |
662 | | - "countries.to_sql('countries', dburi, schema='doc', if_exists='append', \n", |
| 700 | + "countries.to_sql('countries', CONNECTION_STRING, if_exists='append',\n", |
663 | 701 | " index=False, chunksize=1000, parallel=True, method=insert_bulk)" |
664 | 702 | ] |
665 | 703 | }, |
|
692 | 730 | } |
693 | 731 | ], |
694 | 732 | "source": [ |
695 | | - "cities.to_sql('cities', dburi, schema='doc', if_exists='append', \n", |
| 733 | + "cities.to_sql('cities', CONNECTION_STRING, if_exists='append',\n", |
696 | 734 | " index=False, chunksize=1000, parallel=True, method=insert_bulk)" |
697 | 735 | ] |
698 | 736 | } |
|
0 commit comments