From d6efec41983ea135ff8ba5882c5643285e3b9f01 Mon Sep 17 00:00:00 2001 From: RealHaks Date: Sun, 4 Mar 2018 22:02:33 +0100 Subject: [PATCH] Get ratings and get rated sales methods written --- JumiaPy/Jumia-Data-Scraper.ipynb | 217 ++++++++++++++++++++++++++++--- 1 file changed, 200 insertions(+), 17 deletions(-) diff --git a/JumiaPy/Jumia-Data-Scraper.ipynb b/JumiaPy/Jumia-Data-Scraper.ipynb index dcce25d..b4e58e9 100644 --- a/JumiaPy/Jumia-Data-Scraper.ipynb +++ b/JumiaPy/Jumia-Data-Scraper.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -77,7 +77,7 @@ " self.links = []\n", " self.prices = []\n", " self.ratings = []\n", - " self.sales = []\n", + " self.rated_sales = []\n", " \n", " \n", " def get_pages(self):\n", @@ -109,7 +109,8 @@ " \"\"\"\n", " \n", " for page in self.pages:\n", - " self.products += each_page.find_all(\"div\", {\"class\": re.compile(r\"(?:^mabaya sku -gallery$)|(?:^sku -gallery$)\")}) \n", + " class_value = re.compile(r\"(?:^mabaya sku -gallery$)|(?:^sku -gallery$)|(?:^sku -gallery.+$)\")\n", + " self.products += each_page.find_all(\"div\", {\"class\": class_value}) \n", " \n", " def get_links(self):\n", " \"\"\"\n", @@ -163,8 +164,17 @@ " It returns a list of the product ratings\n", " \n", " \"\"\"\n", + " \n", + " for product in self.products:\n", + " if each.find(\"div\", {\"class\": \"stars\"}) == None:\n", + " self.ratings.append(\"No Rating\")\n", + " else:\n", + " rating_tag = each.find(\"div\", {\"class\": \"stars\"})[\"style\"]\n", + " rating_re = re.compile(r\"[0-9]+\")\n", + " stars = format(int(rating_re.findall(rating_tag)[0])/100 * 5, '.2f')\n", + " self.ratings.append(stars)\n", " \n", - " def get_sales():\n", + " def get_rated_sales():\n", " \"\"\"\n", " \n", " It takes in a list of html pages already parsed by the get_pages method\n", @@ -175,12 +185,18 @@ " \n", " \"\"\"\n", "\n", - " " + " for product in self.products:\n", + " if product.find(\"div\", {\"class\": \"total-ratings\"}) == None:\n", + " self.rated_sales.append(\"No Rated Sales\")\n", + " else:\n", + " rated_sales_re = re.compile(r\"[0-9]+\")\n", + " dirty_sales = each.find(\"div\", {\"class\": \"total-ratings\"}).text\n", + " print(rated_sales_re.findall(dirty_sales)[0])" ] }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -189,27 +205,69 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 30, "metadata": { "scrolled": false }, "outputs": [ { - "data": { - "text/plain": [ - "'10,510'" - ] - }, - "execution_count": 48, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "379\n", + "No Rated Sales\n", + "No Rated Sales\n", + "122\n", + "29\n", + "171\n", + "3\n", + "146\n", + "226\n", + "182\n", + "45\n", + "523\n", + "93\n", + "214\n", + "65\n", + "805\n", + "55\n", + "51\n", + "No Rated Sales\n", + "132\n", + "78\n", + "8\n", + "2\n", + "8\n", + "798\n", + "6\n", + "22\n", + "35\n", + "3\n", + "No Rated Sales\n", + "2\n", + "No Rated Sales\n", + "28\n", + "16\n", + "61\n", + "No Rated Sales\n", + "69\n", + "25\n", + "13\n", + "477\n" + ] } ], "source": [ "ade2 = ade.content\n", "ade_html = bso(ade2, \"html.parser\").find(\"section\", {\"class\":\"products -mabaya\"})\n", - "ade3 = ade_html.find_all(\"div\", {\"class\": re.compile(r\"(?:^mabaya sku -gallery$)|(?:^sku -gallery$)\")})\n", - "ade3[0].find(\"span\", {\"data-price\": re.compile(r\"\\d+\")}).text\n", + "ade3 = ade_html.find_all(\"div\", {\"class\": re.compile(r\"(?:^mabaya sku -gallery$)|(?:^sku -gallery$)|(?:^sku -gallery.+$)\")})\n", + "for each in ade3:\n", + " if each.find(\"div\", {\"class\": \"total-ratings\"}) == None:\n", + " print(\"No Rated Sales\")\n", + " else:\n", + " rated_sales_re = re.compile(r\"[0-9]+\")\n", + " dirty_sales = each.find(\"div\", {\"class\": \"total-ratings\"}).text\n", + " print(rated_sales_re.findall(dirty_sales)[0])\n", "#.find(\"span\", {\"class\": \"price \"}).get_text()\n", "#.find(\"span\", {\"data-price\": re.compile(r\"\\d+\")}).text" ] @@ -242,6 +300,131 @@ "print(requests.__version__)" ] }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['__abs__',\n", + " '__add__',\n", + " '__bool__',\n", + " '__class__',\n", + " '__delattr__',\n", + " '__dir__',\n", + " '__divmod__',\n", + " '__doc__',\n", + " '__eq__',\n", + " '__float__',\n", + " '__floordiv__',\n", + " '__format__',\n", + " '__ge__',\n", + " '__getattribute__',\n", + " '__getformat__',\n", + " '__getnewargs__',\n", + " '__gt__',\n", + " '__hash__',\n", + " '__init__',\n", + " '__int__',\n", + " '__le__',\n", + " '__lt__',\n", + " '__mod__',\n", + " '__mul__',\n", + " '__ne__',\n", + " '__neg__',\n", + " '__new__',\n", + " '__pos__',\n", + " '__pow__',\n", + " '__radd__',\n", + " '__rdivmod__',\n", + " '__reduce__',\n", + " '__reduce_ex__',\n", + " '__repr__',\n", + " '__rfloordiv__',\n", + " '__rmod__',\n", + " '__rmul__',\n", + " '__round__',\n", + " '__rpow__',\n", + " '__rsub__',\n", + " '__rtruediv__',\n", + " '__setattr__',\n", + " '__setformat__',\n", + " '__sizeof__',\n", + " '__str__',\n", + " '__sub__',\n", + " '__subclasshook__',\n", + " '__truediv__',\n", + " '__trunc__',\n", + " 'as_integer_ratio',\n", + " 'conjugate',\n", + " 'fromhex',\n", + " 'hex',\n", + " 'imag',\n", + " 'is_integer',\n", + " 'real']" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dir(float)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "asd = \"(234)\"" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": "Can't convert 'list' object to str implicitly", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0masf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0masd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"(\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\")\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m: Can't convert 'list' object to str implicitly" + ] + } + ], + "source": [ + "asf = asd.replace([\"(\",\")\"], \"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'234)'" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "asf" + ] + }, { "cell_type": "code", "execution_count": null,