From 8e24cce4e68311225304e743cd06e7a6d9eae159 Mon Sep 17 00:00:00 2001 From: Kiese Diangebeni Reagan <48117141+Rekidiang2@users.noreply.github.com> Date: Sat, 31 Dec 2022 09:42:20 +0100 Subject: [PATCH 1/3] Update 1-Data-Cleaning.ipynb --- 1-Data-Cleaning.ipynb | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/1-Data-Cleaning.ipynb b/1-Data-Cleaning.ipynb index 2689063..24f3685 100644 --- a/1-Data-Cleaning.ipynb +++ b/1-Data-Cleaning.ipynb @@ -80,7 +80,9 @@ " '''Returns transcript data specifically from scrapsfromtheloft.com.'''\n", " page = requests.get(url).text\n", " soup = BeautifulSoup(page, \"lxml\")\n", - " text = [p.text for p in soup.find(class_=\"ast-container\").find_all('p')]\n", + " #text = [p.text for p in soup.find(class_=\"ast-container\").find_all('p')]\n", + " # page html code was modified this line of code can be formated like this\n", + " text = [p.text for p in soup.find_all('p')]\n", " print(url)\n", " return text\n", "\n", From fd672fc28de3a0d69bf6749348211c6f15ebee56 Mon Sep 17 00:00:00 2001 From: Ahtisham-1214 Date: Fri, 10 Oct 2025 09:55:08 +0500 Subject: [PATCH 2/3] Resolve issue #21: now using proper regex format to resolve syntax warning --- 1-Data-Cleaning.ipynb | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/1-Data-Cleaning.ipynb b/1-Data-Cleaning.ipynb index 2689063..e33babe 100644 --- a/1-Data-Cleaning.ipynb +++ b/1-Data-Cleaning.ipynb @@ -132,11 +132,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "collapsed": true }, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'comedians' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[1], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# Load pickled files\u001b[39;00m\n\u001b[0;32m 2\u001b[0m data \u001b[38;5;241m=\u001b[39m {}\n\u001b[1;32m----> 3\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i, c \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(\u001b[43mcomedians\u001b[49m):\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtranscripts/\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m c \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.txt\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrb\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m file:\n\u001b[0;32m 5\u001b[0m data[c] \u001b[38;5;241m=\u001b[39m pickle\u001b[38;5;241m.\u001b[39mload(file)\n", + "\u001b[1;31mNameError\u001b[0m: name 'comedians' is not defined" + ] + } + ], "source": [ "# Load pickled files\n", "data = {}\n", @@ -284,9 +296,9 @@ "def clean_text_round1(text):\n", " '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''\n", " text = text.lower()\n", - " text = re.sub('\\[.*?\\]', '', text)\n", - " text = re.sub('[%s]' % re.escape(string.punctuation), '', text)\n", - " text = re.sub('\\w*\\d\\w*', '', text)\n", + " text = re.sub(r'\\[.*?\\]', '', text)\n", + " text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)\n", + " text = re.sub(r'\\w*\\d\\w*', '', text)\n", " return text\n", "\n", "round1 = lambda x: clean_text_round1(x)" @@ -508,7 +520,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.2" + "version": "3.12.2" }, "toc": { "nav_menu": {}, From 2a92d8f688ae1c1439822a3548a7451c67e2f329 Mon Sep 17 00:00:00 2001 From: Ahtisham-1214 Date: Fri, 10 Oct 2025 10:01:49 +0500 Subject: [PATCH 3/3] Resolve issue #21: now using proper regex format to resolve syntax warning --- 1-Data-Cleaning.ipynb | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/1-Data-Cleaning.ipynb b/1-Data-Cleaning.ipynb index e33babe..9600a52 100644 --- a/1-Data-Cleaning.ipynb +++ b/1-Data-Cleaning.ipynb @@ -132,23 +132,11 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "collapsed": true }, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'comedians' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[1;32mIn[1], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# Load pickled files\u001b[39;00m\n\u001b[0;32m 2\u001b[0m data \u001b[38;5;241m=\u001b[39m {}\n\u001b[1;32m----> 3\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i, c \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(\u001b[43mcomedians\u001b[49m):\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtranscripts/\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m c \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.txt\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrb\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m file:\n\u001b[0;32m 5\u001b[0m data[c] \u001b[38;5;241m=\u001b[39m pickle\u001b[38;5;241m.\u001b[39mload(file)\n", - "\u001b[1;31mNameError\u001b[0m: name 'comedians' is not defined" - ] - } - ], + "outputs": [], "source": [ "# Load pickled files\n", "data = {}\n",