Skip to content

Commit

Permalink
Get ratings and get rated sales methods written
Browse files Browse the repository at this point in the history
  • Loading branch information
HAKSOAT committed Mar 4, 2018
1 parent 99ab129 commit d6efec4
Showing 1 changed file with 200 additions and 17 deletions.
217 changes: 200 additions & 17 deletions JumiaPy/Jumia-Data-Scraper.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -77,7 +77,7 @@
" self.links = []\n",
" self.prices = []\n",
" self.ratings = []\n",
" self.sales = []\n",
" self.rated_sales = []\n",
" \n",
" \n",
" def get_pages(self):\n",
Expand Down Expand Up @@ -109,7 +109,8 @@
" \"\"\"\n",
" \n",
" for page in self.pages:\n",
" self.products += each_page.find_all(\"div\", {\"class\": re.compile(r\"(?:^mabaya sku -gallery$)|(?:^sku -gallery$)\")}) \n",
" class_value = re.compile(r\"(?:^mabaya sku -gallery$)|(?:^sku -gallery$)|(?:^sku -gallery.+$)\")\n",
" self.products += each_page.find_all(\"div\", {\"class\": class_value}) \n",
" \n",
" def get_links(self):\n",
" \"\"\"\n",
Expand Down Expand Up @@ -163,8 +164,17 @@
" It returns a list of the product ratings\n",
" \n",
" \"\"\"\n",
" \n",
" for product in self.products:\n",
" if each.find(\"div\", {\"class\": \"stars\"}) == None:\n",
" self.ratings.append(\"No Rating\")\n",
" else:\n",
" rating_tag = each.find(\"div\", {\"class\": \"stars\"})[\"style\"]\n",
" rating_re = re.compile(r\"[0-9]+\")\n",
" stars = format(int(rating_re.findall(rating_tag)[0])/100 * 5, '.2f')\n",
" self.ratings.append(stars)\n",
" \n",
" def get_sales():\n",
" def get_rated_sales():\n",
" \"\"\"\n",
" \n",
" It takes in a list of html pages already parsed by the get_pages method\n",
Expand All @@ -175,12 +185,18 @@
" \n",
" \"\"\"\n",
"\n",
" "
" for product in self.products:\n",
" if product.find(\"div\", {\"class\": \"total-ratings\"}) == None:\n",
" self.rated_sales.append(\"No Rated Sales\")\n",
" else:\n",
" rated_sales_re = re.compile(r\"[0-9]+\")\n",
" dirty_sales = each.find(\"div\", {\"class\": \"total-ratings\"}).text\n",
" print(rated_sales_re.findall(dirty_sales)[0])"
]
},
{
"cell_type": "code",
"execution_count": 41,
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -189,27 +205,69 @@
},
{
"cell_type": "code",
"execution_count": 48,
"execution_count": 30,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/plain": [
"'10,510'"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
"name": "stdout",
"output_type": "stream",
"text": [
"379\n",
"No Rated Sales\n",
"No Rated Sales\n",
"122\n",
"29\n",
"171\n",
"3\n",
"146\n",
"226\n",
"182\n",
"45\n",
"523\n",
"93\n",
"214\n",
"65\n",
"805\n",
"55\n",
"51\n",
"No Rated Sales\n",
"132\n",
"78\n",
"8\n",
"2\n",
"8\n",
"798\n",
"6\n",
"22\n",
"35\n",
"3\n",
"No Rated Sales\n",
"2\n",
"No Rated Sales\n",
"28\n",
"16\n",
"61\n",
"No Rated Sales\n",
"69\n",
"25\n",
"13\n",
"477\n"
]
}
],
"source": [
"ade2 = ade.content\n",
"ade_html = bso(ade2, \"html.parser\").find(\"section\", {\"class\":\"products -mabaya\"})\n",
"ade3 = ade_html.find_all(\"div\", {\"class\": re.compile(r\"(?:^mabaya sku -gallery$)|(?:^sku -gallery$)\")})\n",
"ade3[0].find(\"span\", {\"data-price\": re.compile(r\"\\d+\")}).text\n",
"ade3 = ade_html.find_all(\"div\", {\"class\": re.compile(r\"(?:^mabaya sku -gallery$)|(?:^sku -gallery$)|(?:^sku -gallery.+$)\")})\n",
"for each in ade3:\n",
" if each.find(\"div\", {\"class\": \"total-ratings\"}) == None:\n",
" print(\"No Rated Sales\")\n",
" else:\n",
" rated_sales_re = re.compile(r\"[0-9]+\")\n",
" dirty_sales = each.find(\"div\", {\"class\": \"total-ratings\"}).text\n",
" print(rated_sales_re.findall(dirty_sales)[0])\n",
"#.find(\"span\", {\"class\": \"price \"}).get_text()\n",
"#.find(\"span\", {\"data-price\": re.compile(r\"\\d+\")}).text"
]
Expand Down Expand Up @@ -242,6 +300,131 @@
"print(requests.__version__)"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['__abs__',\n",
" '__add__',\n",
" '__bool__',\n",
" '__class__',\n",
" '__delattr__',\n",
" '__dir__',\n",
" '__divmod__',\n",
" '__doc__',\n",
" '__eq__',\n",
" '__float__',\n",
" '__floordiv__',\n",
" '__format__',\n",
" '__ge__',\n",
" '__getattribute__',\n",
" '__getformat__',\n",
" '__getnewargs__',\n",
" '__gt__',\n",
" '__hash__',\n",
" '__init__',\n",
" '__int__',\n",
" '__le__',\n",
" '__lt__',\n",
" '__mod__',\n",
" '__mul__',\n",
" '__ne__',\n",
" '__neg__',\n",
" '__new__',\n",
" '__pos__',\n",
" '__pow__',\n",
" '__radd__',\n",
" '__rdivmod__',\n",
" '__reduce__',\n",
" '__reduce_ex__',\n",
" '__repr__',\n",
" '__rfloordiv__',\n",
" '__rmod__',\n",
" '__rmul__',\n",
" '__round__',\n",
" '__rpow__',\n",
" '__rsub__',\n",
" '__rtruediv__',\n",
" '__setattr__',\n",
" '__setformat__',\n",
" '__sizeof__',\n",
" '__str__',\n",
" '__sub__',\n",
" '__subclasshook__',\n",
" '__truediv__',\n",
" '__trunc__',\n",
" 'as_integer_ratio',\n",
" 'conjugate',\n",
" 'fromhex',\n",
" 'hex',\n",
" 'imag',\n",
" 'is_integer',\n",
" 'real']"
]
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dir(float)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"asd = \"(234)\""
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"ename": "TypeError",
"evalue": "Can't convert 'list' object to str implicitly",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-14-10299fe5d909>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0masf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0masd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"(\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\")\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m: Can't convert 'list' object to str implicitly"
]
}
],
"source": [
"asf = asd.replace([\"(\",\")\"], \"\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'234)'"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"asf"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down

0 comments on commit d6efec4

Please sign in to comment.