{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Merge database tables into a flat dataframe" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import sqlite3\n", "import pandas as pd\n", "\n", "pd.set_option(\"display.max_colwidth\", 60)\n", "pd.set_option(\"display.max_rows\", 100)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "RELEASE_DB_NAME = \"release_db.sqlite\"\n", "conn = sqlite3.connect(RELEASE_DB_NAME)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Read in the tables" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# we'll merge everything into this dataframe\n", "df = pd.read_sql_query(\"SELECT * FROM policy_snapshots\", conn)\n", "\n", "sites_df = pd.read_sql_query(\"SELECT * FROM sites\", conn)\n", "policy_texts_df = pd.read_sql_query(\"SELECT * FROM policy_texts\", conn)\n", "alexa_ranks_df = pd.read_sql_query(\"SELECT * FROM alexa_ranks\", conn)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# Left join with policy text table\n", "df = pd.merge(df, policy_texts_df, how=\"left\", left_on=\"policy_text_id\", right_on=\"id\")\n", "\n", "# Left join with sites table\n", "df = pd.merge(df, sites_df, how=\"left\", left_on=\"site_id\", right_on=\"id\")\n", "\n", "# Left join with alexa ranks table\n", "df = pd.merge(df, alexa_ranks_df, how=\"left\", on=['site_id', 'year', 'phase'])" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
id_xsite_idhomepage_snapshot_urlpolicy_snapshot_urlpolicy_urlhomepage_snapshot_redirected_urlyearphasepolicy_text_idpolicy_html_id...flesch_kincaidsmogflesch_easelengthsha1simhashiddomaincategoriesrank
0131160https://web.archive.org/web/20031122123614id_/http%3A//d...https://web.archive.org/web/20031207185149id_/http%3A//w...docusign.com/eDocuSign/privacy.asphttps://web.archive.org/web/20031122123614id_/http://www...2003B11...14.93838418.351184difficult1413737519a313f07f98fdbb51f13779ed7d6cb1b446886296459503754419531160docusign.combusiness;informationtechNaN
1231160https://web.archive.org/web/20040924214154id_/http%3A//d...https://web.archive.org/web/20040816044548id_/http%3A//w...docusign.com/privacy.asphttps://web.archive.org/web/20041014005441id_/http://www...2004B22...13.76220218.171460difficult14366837c2e1fdab142cebb3cc008880309ee94a6ecbf86352740756872371931160docusign.combusiness;informationtechNaN
2331160https://web.archive.org/web/20061026200451id_/http%3A//d...https://web.archive.org/web/20061219215634id_/http%3A//w...docusign.com/resources/privacy_policy.phphttps://web.archive.org/web/20061026200451id_/http://www...2006B33...13.64040718.062587difficult143639296f990b576e40dec5d0e54552918247192bd8f432172897143590887131160docusign.combusiness;informationtechNaN
\n", "

3 rows × 31 columns

\n", "
" ], "text/plain": [ " id_x site_id homepage_snapshot_url \\\n", "0 1 31160 https://web.archive.org/web/20031122123614id_/http%3A//d... \n", "1 2 31160 https://web.archive.org/web/20040924214154id_/http%3A//d... \n", "2 3 31160 https://web.archive.org/web/20061026200451id_/http%3A//d... \n", "\n", " policy_snapshot_url \\\n", "0 https://web.archive.org/web/20031207185149id_/http%3A//w... \n", "1 https://web.archive.org/web/20040816044548id_/http%3A//w... \n", "2 https://web.archive.org/web/20061219215634id_/http%3A//w... \n", "\n", " policy_url \\\n", "0 docusign.com/eDocuSign/privacy.asp \n", "1 docusign.com/privacy.asp \n", "2 docusign.com/resources/privacy_policy.php \n", "\n", " homepage_snapshot_redirected_url year phase \\\n", "0 https://web.archive.org/web/20031122123614id_/http://www... 2003 B \n", "1 https://web.archive.org/web/20041014005441id_/http://www... 2004 B \n", "2 https://web.archive.org/web/20061026200451id_/http://www... 2006 B \n", "\n", " policy_text_id policy_html_id ... flesch_kincaid smog flesch_ease \\\n", "0 1 1 ... 14.938384 18.351184 difficult \n", "1 2 2 ... 13.762202 18.171460 difficult \n", "2 3 3 ... 13.640407 18.062587 difficult \n", "\n", " length sha1 simhash \\\n", "0 14137 37519a313f07f98fdbb51f13779ed7d6cb1b4468 862964595037544195 \n", "1 14366 837c2e1fdab142cebb3cc008880309ee94a6ecbf 863527407568723719 \n", "2 14363 9296f990b576e40dec5d0e54552918247192bd8f 4321728971435908871 \n", "\n", " id domain categories rank \n", "0 31160 docusign.com business;informationtech NaN \n", "1 31160 docusign.com business;informationtech NaN \n", "2 31160 docusign.com business;informationtech NaN \n", "\n", "[3 rows x 31 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head(3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 4 }