Skip to content

Commit

Permalink
unicode ascii problem fixed
Browse files Browse the repository at this point in the history
  • Loading branch information
emilyxxie committed Dec 9, 2016
1 parent e986f7d commit 1c53ce6
Show file tree
Hide file tree
Showing 6 changed files with 324 additions and 36 deletions.
29 changes: 29 additions & 0 deletions lib/additional_diseases.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
African_trypanosomiasis
Lymphatic_filariasis
Japanese_encephalitis
Dracunculiasis
Filariasis
Soil-transmitted_helminthiasis
Helminthiasis
Schistosomiasis
Leishmaniasis
Paragonimiasis
Opisthorchiasis
Echinococcosis
Onchocerciasis
Chagas_disease => American trypanosomiasis
Schistosoma_bovis
Spondweni_fever
Lobomycosis
Chikungunya
Tropical_eosinophilia
Creutzfeldt-Jakob_disease
Fatal_familial_insomnia
Cryptococcosis
Granulomatous_amoebic_encephalitis
Aspergillosis
Tularemia
Eastern_equine_encephalitis_virus
Intestinal_capillariasis
Venezuelan_equine_encephalitis_virus
Middle_East_respiratory_syndrome
223 changes: 223 additions & 0 deletions lib/additional_symptoms.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
cachexia
loss of appetite
weight loss
weight gain
dry mouth
fatigue
malaise
asthenia
muscle weakness
pyrexia
jaundice
pain
abdominal pain
chest pain
bruising
epistaxis
tremor
convulsions
muscle cramps
tinnitus
dizziness
vertigo
syncope
hypothermia
hyperthermia
discharge
bleeding
swelling
deformity
sweats
chills
shivering
acalculia
acrophobia
agnosia
agoraphobia
akathisia
akinesia
alexia
amusia
anhedonia
anomia
anosognosia
anxiety
apraxia
arachnophobia
ataxia
bradykinesia
cataplexy
chorea
claustrophobia
confusion
depression
dysarthria
dysgraphia
dystonia
euphoria
hallucination
headache
homicidal ideation
insomnia
mania
paralysis
paranoia
paresthesia
phobia
somnolence
tic
tremor
blurred vision
double vision
exophthalmos
nystagmus
anorexia
bloating
belching
blood in stool
constipation
diarrhea
dysphagia
dyspepsia
flatulence
fecal incontinence
haematemesis
nausea
odynophagia
pyrosis
rectal malodor
vomiting
cardiovascular pain
chest pain
claudication
palpitation
tachycardia
bradycardia
arrhythmia
dysuria
hematuria
impotence
polyuria
retrograde ejaculation
urinary frequency
urinary incontinence
urinary retention
hypoventilation
hyperventilation
sleep apnea
apnea
cough
dyspnea
hemoptysis
chest pain
sputum production
tachypnea
abrasion
alopecia
anasarca
blister
edema
hirsutism
itching
laceration
paresthesia
rash
urticaria
dyspareunia
painful intercourse
pelvic pain
infertility
pain
vaginal bleeding
post-prandial abdominal pain
trouble breathing
dehydration
pallor
facial flushing
low blood pressure
excessive thirst
constipation
acid reflux
visual disturbance
headache
nerve pain
muscle cramp
fainting
scoliosis
pain
stabbing pain
chronic pain
loss of vision
muscle weakness
myopathy
waddling gait
difficulty gaining weight
aching
pressure sensation
facial pain
tooth pain
poor appetite
vomiting
poor growth
irritability
frequent fevers
frequent infection
jaundice
yellowing of the skin
itchiness
poor absorption of nutrients
pale stools
dark urine
swollen abdomen
bone fragility
seizure
irregular heartbeat
breathing problems
thin hair
brittle nails
muscle pain
mottled skin
myalgia
unstable joints
bleeding
gastrointestinal discomfort
malabsorption of food
decreased motility
hearing loss
gastrointestinal issues
twitching
twitch
drooling
lethargy
malaise
sleep disturbances
skeletal malformations
skeletal malformation
hyperactivity
aggression
loss of inhibition
inflammation
oral lesions
painful lesions
eye pain
blurred vision
red eye
stiff neck
difficulty speaking
memory loss
loss of memory
visual disturbances
blindness
deafness
fever
winterbottom's sign
chancre
aggressive behavior
loss of mobility
enlarged spleen
impaired vision
bone deformity
bone erosion
intracranial pressure
breathlessness
loss of speech
88 changes: 57 additions & 31 deletions lib/disease_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,49 +2,75 @@
import json
import sys
import re
import string
import urllib2
from bs4 import BeautifulSoup
import pdb

disease = wikipedia.page('Cerebral_salt-wasting_syndrome')
# rare_diseases = "https://en.wikipedia.org/w/index.php?title=Category:Rare_diseases"

regex = r"<a.*?>(.*?)<\/a>"
data = open('symptoms_original.txt', 'r').read()
matches = re.findall(regex, data)

symptoms_regex = ""
group_count = 0
page_name = "https://en.wikipedia.org/w/index.php?title=Category:Rare_diseases&from="
# page_name = ""
links = set()

symptoms_array = set()
complete_symptom_list = set()
for c in string.ascii_lowercase:
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]  # Wikipedia needs a User-agent header on the request

for match in matches:
if "(" in match:
in_parens = match[match.find("(") + 1: match.find(")")].lower().strip()
if in_parens not in symptoms_array:
symptoms_array.add(in_parens.lower())
group_count += 1
match = match[:match.find("(")]
symptoms_array.add(match.lower().strip())
group_count += 1
resource = opener.open(page_name + c)

print(page_name + c)

if group_count == 100:
# we must batch the regex match since there is a 100 group count limit
symptoms_array = list(symptoms_array)
symptoms_regex = "|".join(symptoms_array)
sd_matches = re.findall(r'%s' % symptoms_regex, disease.content)
data = resource.read()
resource.close()
soup = BeautifulSoup(data, "html.parser")

for sd_match in sd_matches:
sd_match = sd_match.encode(sys.getdefaultencoding())
complete_symptom_list.add(sd_match)
links |= set(map(lambda t: t.get("href")[6:].encode(sys.getdefaultencoding()),
soup.find(id="mw-pages").find(class_="mw-category").find_all("a")))

symptoms_array = set()
symptoms_regex = ""
group_count = 0

replacers = [('%E2%80%93', '_'), ('%C3%A9', 'e'), ('%C3%B6', 'o')]
print(len(links))
for link in links:
print 'fetching', link
try:
for r, s in replacers:
link = link.replace(r, s)
page = wikipedia.page(link)
except wikipedia.exceptions.PageError:
print 'bad link:', link

print(complete_symptom_list)
# print(page.content)

# regex = r"<a.*?>(.*?)<\/a>"
# symptoms = json.loads(open('../symptoms.js', 'r').read())
# symptoms_regex = ""
# group_count = 0

# symptoms_array = set()
# complete_symptom_list = set()

# symptoms_regex = "(leaking urine)|(hair loss)|(dizziness)|(weakness)|(vertigo)"
# for symptom in symptoms['symptoms']:
# group_count += 1
# symptoms_array.add(
# symptom.encode(sys.getdefaultencoding())
# )

# for
# if group_count == 100:
# # we must batch the regex match since there is a 100 group count limit
# symptoms_array = list(symptoms_array)
# symptoms_regex = "|".join(symptoms_array)
# sd_matches = re.findall(r'%s' % symptoms_regex, disease.content, re.IGNORECASE)

# for sd_match in sd_matches:
# sd_match = sd_match.encode(sys.getdefaultencoding()).lower()
# complete_symptom_list.add(sd_match)

# symptoms_array = set()
# symptoms_regex = ""
# group_count = 0



# print(complete_symptom_list)
10 changes: 10 additions & 0 deletions lib/remove_symptoms.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
remove from symptoms:
- Crying baby
- female
- frequency



remove from diseases:
- ALS?
- cystic fibrosis
1 change: 1 addition & 0 deletions lib/symptoms_original.txt

Large diffs are not rendered by default.

9 changes: 4 additions & 5 deletions lib/symptoms_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,7 @@
data = open('symptoms_original.txt', 'r').read()
matches = re.findall(regex, data)

symptoms = symptom()
# for match in matches:
# symptoms_json['symptoms'].append(match)
symptoms = set()

for match in matches:
if "(" in match:
Expand All @@ -15,8 +13,9 @@
match = match[:match.find("(")]
symptoms.add(match.lower().strip())


symptoms_json = {'symptoms' : []}
data2 = open('additional_symptoms.txt', 'r').read().splitlines()
symptoms = symptoms.union(set(data2))
symptoms_json = {'symptoms' : list(symptoms)}

import io, json
with io.open('../symptoms.js', 'w', encoding='utf-8') as f:
Expand Down

0 comments on commit 1c53ce6

Please sign in to comment.