diff --git a/Machine Learning/ch3.pandas/pandas_book_readers/book_readers.ipynb b/Machine Learning/ch3.pandas/pandas_book_readers/book_readers.ipynb new file mode 100644 index 0000000..ab0ea03 --- /dev/null +++ b/Machine Learning/ch3.pandas/pandas_book_readers/book_readers.ipynb @@ -0,0 +1,721 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "ddU9LZWUEiAd" + }, + "source": [ + "

\n", + "\n", + "قشر کتاب‌خوان\n", + "\n", + "

\n", + "\n", + "

\n", + "\n", + "در این تمرین برای کمک به قشر کتاب‌خوان می‌خواهیم به تحلیل داده‌های سایت Goodreads بپردازیم و اطلاعات جالب و کاربردی‌ای را از آن استخراج کنیم.\n", + "\n", + "

" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

\n", + "\n", + "مجموعه‌داده\n", + "\n", + "

\n", + "\n", + "

\n", + "\n", + "اطلاعات کتاب‌ها در فایل books_db.csv، نویسندگان در فایل authors.csv، ناشران در فایل publisher.csv و زبان‌ها در فایل language.csv قرار گرفته است. به‌کمک پانداز هرکدام از این فایل‌ها را خوانده و به‌ترتیب در متغیرهای df، authors، publisher و language ذخیره کنید.\n", + "\n", + "

" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
bookIDtitleaverage_ratingisbnisbn13num_pagesratings_counttext_reviews_countpublication_dateauthor_idlang_idpublisher_id
01Harry Potter and the Half-Blood Prince (Harry ...4.574397859609.780440e+126522095690275919/16/2006000
12Harry Potter and the Order of the Phoenix (Har...4.494393580789.780440e+128702153167292219/1/2004000
24Harry Potter and the Chamber of Secrets (Harry...4.424395548969.780440e+12352633324411/1/2003001
35Harry Potter and the Prisoner of Azkaban (Harr...4.56043965548X9.780440e+124352339585363255/1/2004000
48Harry Potter Boxed Set Books 1-5 (Harry Potte...4.784396825849.780440e+122690414281649/13/2004001
\n", + "
" + ], + "text/plain": [ + " bookID title average_rating \\\n", + "0 1 Harry Potter and the Half-Blood Prince (Harry ... 4.57 \n", + "1 2 Harry Potter and the Order of the Phoenix (Har... 4.49 \n", + "2 4 Harry Potter and the Chamber of Secrets (Harry... 4.42 \n", + "3 5 Harry Potter and the Prisoner of Azkaban (Harr... 4.56 \n", + "4 8 Harry Potter Boxed Set Books 1-5 (Harry Potte... 4.78 \n", + "\n", + " isbn isbn13 num_pages ratings_count text_reviews_count \\\n", + "0 439785960 9.780440e+12 652 2095690 27591 \n", + "1 439358078 9.780440e+12 870 2153167 29221 \n", + "2 439554896 9.780440e+12 352 6333 244 \n", + "3 043965548X 9.780440e+12 435 2339585 36325 \n", + "4 439682584 9.780440e+12 2690 41428 164 \n", + "\n", + " publication_date author_id lang_id publisher_id \n", + "0 9/16/2006 0 0 0 \n", + "1 9/1/2004 0 0 0 \n", + "2 11/1/2003 0 0 1 \n", + "3 5/1/2004 0 0 0 \n", + "4 9/13/2004 0 0 1 " + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "df = pd.read_csv(\"./books_db.csv\")\n", + "authors = pd.read_csv(\"./authors.csv\")\n", + "language = pd.read_csv(\"./language.csv\")\n", + "publisher = pd.read_csv(\"./publisher.csv\")\n", + "\n", + "df.head()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

\n", + "\n", + "قسمت اول\n", + "\n", + "

\n", + "\n", + "

\n", + "\n", + "همان‌طور که مشاهده می‌کنید در مجموعه‌داده‌ی اصلی (کتاب‌ها)، مقادیر سه ستون lang_id، author_id و publisher_id تنها به‌صورت آیدی (id) هستند. این آیدی‌ها با شماره‌ی نمایه‌ها (index) در سایر دیتافریم‌ها متناظر هستند. در این قسمت قصد داریم طبق این نمایه‌ها، اطلاعات کامل‌شان را از دیتافریم مربوطه استخراج کرده و به دیتافریم df اضافه کنیم. بنابراین سه دیتافریم authors، language و publisher را به‌شکل مناسبی با دیتافریم df ادغام (merge) کنید.\n", + "\n", + "

" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
bookIDtitleaverage_ratingisbnisbn13num_pagesratings_counttext_reviews_countpublication_dateauthor_idlang_idpublisher_idname_xcodename_y
01Harry Potter and the Half-Blood Prince (Harry ...4.574397859609.780440e+126522095690275919/16/2006000J.K. RowlingengScholastic Inc.
12Harry Potter and the Order of the Phoenix (Har...4.494393580789.780440e+128702153167292219/1/2004000J.K. RowlingengScholastic Inc.
35Harry Potter and the Prisoner of Azkaban (Harr...4.56043965548X9.780440e+124352339585363255/1/2004000J.K. RowlingengScholastic Inc.
283710546The Long-Lost Map (Ulysses Moore #2)4.00043977439X9.780440e+122721693767/1/2006128000Pierdomenico BaccalarioengScholastic Inc.
619023300The Door to Time (Ulysses Moore #1)3.934397743819.780440e+1223930992411/1/2006128000Pierdomenico BaccalarioengScholastic Inc.
\n", + "
" + ], + "text/plain": [ + " bookID title \\\n", + "0 1 Harry Potter and the Half-Blood Prince (Harry ... \n", + "1 2 Harry Potter and the Order of the Phoenix (Har... \n", + "3 5 Harry Potter and the Prisoner of Azkaban (Harr... \n", + "2837 10546 The Long-Lost Map (Ulysses Moore #2) \n", + "6190 23300 The Door to Time (Ulysses Moore #1) \n", + "\n", + " average_rating isbn isbn13 num_pages ratings_count \\\n", + "0 4.57 439785960 9.780440e+12 652 2095690 \n", + "1 4.49 439358078 9.780440e+12 870 2153167 \n", + "3 4.56 043965548X 9.780440e+12 435 2339585 \n", + "2837 4.00 043977439X 9.780440e+12 272 1693 \n", + "6190 3.93 439774381 9.780440e+12 239 3099 \n", + "\n", + " text_reviews_count publication_date author_id lang_id publisher_id \\\n", + "0 27591 9/16/2006 0 0 0 \n", + "1 29221 9/1/2004 0 0 0 \n", + "3 36325 5/1/2004 0 0 0 \n", + "2837 76 7/1/2006 1280 0 0 \n", + "6190 241 1/1/2006 1280 0 0 \n", + "\n", + " name_x code name_y \n", + "0 J.K. Rowling eng Scholastic Inc. \n", + "1 J.K. Rowling eng Scholastic Inc. \n", + "3 J.K. Rowling eng Scholastic Inc. \n", + "2837 Pierdomenico Baccalario eng Scholastic Inc. \n", + "6190 Pierdomenico Baccalario eng Scholastic Inc. " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.merge(df,authors,left_on=\"author_id\",right_index=True)\n", + "df = pd.merge(df,language,left_on=\"lang_id\",right_index=True)\n", + "df = pd.merge(df,publisher,left_on=\"publisher_id\",right_index=True)\n", + "df.head()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

\n", + "\n", + "حال که تمام مقادیر اضافه شده‌اند، نیازی به سه ستون id که به‌عنوان کلید استفاده کردیم نداریم. پس آن‌ها را با استفاده از دستور drop حذف می‌کنیم.\n", + " در درسنامه‌های بعدی نحوه‌ی کار با این دستور را یاد خواهید گرفت. \n", + "\n", + "

" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "df.drop(columns = ['author_id', 'lang_id', 'publisher_id'], inplace = True)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

\n", + "\n", + "قسمت دوم\n", + "\n", + "

\n", + "\n", + "

\n", + "\n", + "حال نام ستون‌های دیتافریم را به‌گونه‌ای تغییر دهید که به‌ترتیب از چپ به راست برابر نام‌های زیر باشد:\n", + "