diff --git a/README.md b/README.md index 0ac071a..35ca5ac 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ Then install required python packages ## Applications -okCupid provides self descriptions, selfies and big questionnaires that are really interesting for anyone interested in psychometrics. +okCupid provides self descriptions, selfies and big questionnaires that are really useful for anyone interested in psychometrics. For now I just used it to scrape self descriptions and train an AI to generate new ones. You can check more about it [here](https://mathigatti.com/2021/02/15/okcupid-synthetic-profiles/). @@ -28,7 +28,7 @@ This scraper has two script, the first one downloads the profile data (except th Using this script and changing your profile details, like gender, sexual orientation and location you can scrape pretty much all users in a given location in okCupid. -You can run it like this +You can run it like this; user data will be downloaded into the _users_ folder - python users_by_discover.py @@ -36,8 +36,14 @@ You can also try the _users_by_question.py_ script, it search for users that ans ### Download questions +You can run it like this; users' answers will be downloaded into the _answers_ folder + - python users_by_question.py +### Parsing data + +The _testing.ipynb_ notebook contains some examples of how to process the data. User data is downloaded as HTML, so I use BeautifulSoup to parse it and extract the relevant information. Users' answers to questions are stored in JSON format, so they are easier to process. 
+ ## Extra ### Related datasets diff --git a/testing.ipynb b/testing.ipynb new file mode 100644 index 0000000..5a3d5fc --- /dev/null +++ b/testing.ipynb @@ -0,0 +1,347 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Parsing HTML files containing users data" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 21628/21628 [02:31<00:00, 142.31it/s]\n" + ] + } + ], + "source": [ + "from bs4 import BeautifulSoup\n", + "from glob import glob\n", + "import json\n", + "from collections import defaultdict\n", + "import random\n", + "import json\n", + "from tqdm import tqdm\n", + "\n", + "def looking_for(text):\n", + " text = text.replace(\"& \", \"\")\n", + " text = text.split(\"Looking for \")[-1]\n", + " if \" | \" in text: \n", + " gender = text.split(\" | \")[0].split(\", \")\n", + " relationship_type = text.split(\" | \")[1].split(\", \")\n", + " return gender, relationship_type\n", + " else:\n", + " if \"man\" in text.lower():\n", + " return text.split(\", \"), \"\"\n", + " else:\n", + " return \"\", text.split(\", \")\n", + " \n", + "with open(\"title2topic.json\",'r') as f:\n", + " title2topic = json.load(f)\n", + "\n", + "basic_values = {\n", + " \"name\": (\"profile-basics-username-text\", \"cardsummary-item cardsummary-realname\"),\n", + " \"age\": (\"profile-basics-asl-age\", \"cardsummary-item cardsummary-age\"),\n", + " \"location\": (\"profile-basics-asl-location\", \"cardsummary-item cardsummary-location\")\n", + "}\n", + "\n", + "essays_class = (\"profile-essay\", \"qmessays-essay\")\n", + "image_class = (\"profile-thumb\", \"qmcard-carousel-viewport-inner\")\n", + "\n", + "users = glob(\"users/*.html\")\n", + "random.shuffle(users)\n", + "\n", + "users_basics = []\n", + "\n", + "for user in tqdm(users):\n", + " with open(user,'r') as f:\n", + " data = f.read()\n", + " basics = {}\n", + "\n", + " 
user_id = user[6:-5]\n", + " soup = BeautifulSoup(data)\n", + "\n", + " r = soup.find(\"span\",{\"class\":\"profile-basics-username-text\"})\n", + "\n", + " if r:\n", + " index = 0\n", + " else:\n", + " index = 1\n", + "\n", + " for name, value in basic_values.items():\n", + " basics[name] = soup.find(\"span\",{\"class\": value[index]}).text\n", + " basics[\"age\"] = int(basics[\"age\"])\n", + "\n", + " basics = {}\n", + " values = [v.text for v in soup.find_all(\"div\", {\"class\": \"matchprofile-details-text\"})]\n", + " \n", + " first = 0\n", + " for term in [\"VACCINATED\"]:\n", + " if term in values[first]:\n", + " first += 1\n", + "\n", + " sexual_basics = values[first].split(\" | \")\n", + " \n", + " if len(sexual_basics) == 3:\n", + " gender, sexual_orientation, relationship_type = sexual_basics\n", + " elif len(sexual_basics) == 4:\n", + " gender, sexual_orientation, bdsm, relationship_type = sexual_basics\n", + " elif len(sexual_basics) == 2:\n", + " gender, relationship_type = sexual_basics\n", + " relationship_type = \"\"\n", + " else:\n", + " print(values[0])\n", + " continue\n", + "\n", + " lf_gender, lf_relationship_type = looking_for(values[-1])\n", + " basics[\"user_id\"] = user_id\n", + " basics[\"gender\"] = gender\n", + " basics[\"sexual_orientation\"] = sexual_orientation\n", + " basics[\"relationship_type\"] = relationship_type\n", + " basics[\"looking_for_gender\"] = lf_gender\n", + " basics[\"looking_for_relationship_type\"] = lf_relationship_type\n", + " basics[\"extra\"] = \" | \".join(values[first:-1])\n", + " users_basics.append(basics)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "df = pd.DataFrame(users_basics)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "essays = []\n", + "for answer in soup.find_all(\"div\", 
{\"class\": essays_class[index]}):\n", + " if index == 1:\n", + " try:\n", + " title = answer.find(\"h2\").text\n", + " except:\n", + " continue\n", + " topic = title2topic[title]\n", + " else:\n", + " try:\n", + " topic, title = [t.text for t in answer.find_all(\"h2\")]\n", + " except:\n", + " continue\n", + "\n", + " essays.append( {\"user_id\": user_id, \"topic\": topic, \"title\": title, \"answer\": answer.find(\"p\").text.strip()} )\n", + "\n", + "essays = pd.DataFrame(essays)\n", + "essays.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 461, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 461, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "df[[\"age\",\"user_id\"]].groupby(\"age\").count().plot()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Parsing JSON files containing users answers" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 21628/21628 [00:53<00:00, 407.71it/s]\n" + ] + } + ], + "source": [ + "from tqdm import tqdm\n", + "expected_answers = []\n", + "personal_answers = []\n", + "\n", + "question2id = {}\n", + "\n", + "for user in tqdm(users):\n", + " user_id = user[6:-5]\n", + "\n", + " with open(f\"answers/{user_id}.json\",'r') as f:\n", + " data = f.read()\n", + "\n", + " try:\n", + " questions = json.loads(data)\n", + " except:\n", + " continue\n", + "\n", + " for question in questions:\n", + " importance = question[\"target\"][\"importance\"]\n", + " accepts = question[\"target\"][\"accepts\"]\n", + " answer = question[\"target\"][\"answer\"]\n", + "\n", + " question_id = question[\"question\"][\"id\"]\n", + "\n", + " personal_answer = {\n", + " \"user_id\": user_id,\n", + " \"question_id\": question_id,\n", + " \"answer\": answer,\n", + " \"importance\": importance\n", + " }\n", + " \n", + " personal_answers.append(personal_answer)\n", + "\n", + " question2id[question[\"question\"][\"text\"]] = question_id\n", + " for accept in accepts:\n", + " next_answer = {\n", + " \"user_id\": user_id,\n", + " \"question_id\": question_id,\n", + " \"accepts\": accept,\n", + " \"importance\": importance\n", + " }\n", + " \n", + " expected_answers.append(next_answer)\n", + "\n", + "expected_answers = pd.DataFrame(expected_answers)\n", + "personal_answers = pd.DataFrame(personal_answers)" + ] + }, + { + "cell_type": "code", + "execution_count": 486, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('Could 
you be in a committed relationship with someone who regularly smokes pot (> 2 times a week)?',\n", + " 462356),\n", + " ('Have you smoked a cigarette in the last 6 months?', 501),\n", + " ('Regardless of whether or not you smoke marijuana, do you think it should be legalized for adults?',\n", + " 341),\n", + " ('Would you go out with a smoker?', 13006),\n", + " ('You witness your next door neighbors discreetly smoking marijuana on their own property. How do you respond?',\n", + " 47589),\n", + " ('What do you think of laws that make smoking illegal in bars and restaurants?',\n", + " 13054),\n", + " ('How often do you smoke cigars?', 80621),\n", + " ('Do you smoke Weed?', 119507),\n", + " ('If you had a potentially harmful addiction such as smoking, drinking, or drugs, how do you think you would you feel if a significant other tried to get you to stop?',\n", + " 36331),\n", + " ('Do you think parents who smoke in cars with very young children should be punished?',\n", + " 51053),\n", + " ('If you caught your child smoking pot what would you do?', 21487),\n", + " ('Do you think it is acceptable to smoke tobacco in front of children?',\n", + " 43545),\n", + " ('Do you smoke cigarettes?', 103168),\n", + " ('All things being equal, would you prefer your ideal partner to be a smoker?',\n", + " 102016),\n", + " ('Do you think smoking in Malaysia should be banned in eateries?',\n", + " 1174168646),\n", + " ('Do you smoke?', 133887)]" + ] + }, + "execution_count": 486, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[(text, question_id) for text, question_id in question2id.items() if \"smok\" in text]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "personal_answers[personal_answers[\"question_id\"] == 462356][[\"answer\",\"user_id\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Testing stuff\n", + "\n", + "## Search users by keyword" + ] + }, + 
{ + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from users_by_question import keyword_search\n", + "keyword_search(\"420\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}