{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Merge database tables into a flat dataframe" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import sqlite3\n", "import pandas as pd\n", "\n", "pd.set_option(\"display.max_colwidth\", 60)\n", "pd.set_option(\"display.max_rows\", 100)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "RELEASE_DB_NAME = \"release_db.sqlite\"\n", "conn = sqlite3.connect(RELEASE_DB_NAME)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Read in the tables" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# we'll merge everything into this dataframe\n", "df = pd.read_sql_query(\"SELECT * FROM policy_snapshots\", conn)\n", "\n", "sites_df = pd.read_sql_query(\"SELECT * FROM sites\", conn)\n", "policy_texts_df = pd.read_sql_query(\"SELECT * FROM policy_texts\", conn)\n", "alexa_ranks_df = pd.read_sql_query(\"SELECT * FROM alexa_ranks\", conn)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# Left join with policy text table\n", "df = pd.merge(df, policy_texts_df, how=\"left\", left_on=\"policy_text_id\", right_on=\"id\")\n", "\n", "# Left join with sites table\n", "df = pd.merge(df, sites_df, how=\"left\", left_on=\"site_id\", right_on=\"id\")\n", "\n", "# Left join with alexa ranks table\n", "df = pd.merge(df, alexa_ranks_df, how=\"left\", on=['site_id', 'year', 'phase'])" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | id_x | \n", "site_id | \n", "homepage_snapshot_url | \n", "policy_snapshot_url | \n", "policy_url | \n", "homepage_snapshot_redirected_url | \n", "year | \n", "phase | \n", "policy_text_id | \n", "policy_html_id | \n", "... | \n", "flesch_kincaid | \n", "smog | \n", "flesch_ease | \n", "length | \n", "sha1 | \n", "simhash | \n", "id | \n", "domain | \n", "categories | \n", "rank | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "1 | \n", "31160 | \n", "https://web.archive.org/web/20031122123614id_/http%3A//d... | \n", "https://web.archive.org/web/20031207185149id_/http%3A//w... | \n", "docusign.com/eDocuSign/privacy.asp | \n", "https://web.archive.org/web/20031122123614id_/http://www... | \n", "2003 | \n", "B | \n", "1 | \n", "1 | \n", "... | \n", "14.938384 | \n", "18.351184 | \n", "difficult | \n", "14137 | \n", "37519a313f07f98fdbb51f13779ed7d6cb1b4468 | \n", "862964595037544195 | \n", "31160 | \n", "docusign.com | \n", "business;informationtech | \n", "NaN | \n", "
1 | \n", "2 | \n", "31160 | \n", "https://web.archive.org/web/20040924214154id_/http%3A//d... | \n", "https://web.archive.org/web/20040816044548id_/http%3A//w... | \n", "docusign.com/privacy.asp | \n", "https://web.archive.org/web/20041014005441id_/http://www... | \n", "2004 | \n", "B | \n", "2 | \n", "2 | \n", "... | \n", "13.762202 | \n", "18.171460 | \n", "difficult | \n", "14366 | \n", "837c2e1fdab142cebb3cc008880309ee94a6ecbf | \n", "863527407568723719 | \n", "31160 | \n", "docusign.com | \n", "business;informationtech | \n", "NaN | \n", "
2 | \n", "3 | \n", "31160 | \n", "https://web.archive.org/web/20061026200451id_/http%3A//d... | \n", "https://web.archive.org/web/20061219215634id_/http%3A//w... | \n", "docusign.com/resources/privacy_policy.php | \n", "https://web.archive.org/web/20061026200451id_/http://www... | \n", "2006 | \n", "B | \n", "3 | \n", "3 | \n", "... | \n", "13.640407 | \n", "18.062587 | \n", "difficult | \n", "14363 | \n", "9296f990b576e40dec5d0e54552918247192bd8f | \n", "4321728971435908871 | \n", "31160 | \n", "docusign.com | \n", "business;informationtech | \n", "NaN | \n", "
3 rows × 31 columns
\n", "