## Ziwen Zhong S2326022
## Wenzheng Zhang S2310185
## Tianqi Dai S2302524
## Contributions: This file is group project 1 from Statistical Programming Group 1.
## When project 1 was released, we discussed our views on the project and drew up
## a guideline setting out our main idea and the coding conventions we would use.
## Ziwen, as our group leader, scheduled the group discussions. We met after
## working from home and each presented our own approach. Once we were sure the
## questions were properly solved and everyone fully understood the code, we put
## our work together: we use Tianqi's steps 3 and 4, Wenzheng's steps 5 and 6,
## and Ziwen's steps 7 and 8. For steps 9 and 10, we asked the lecturer about the
## ambiguous parts. We each made suggestions for the coding procedure and Ziwen
## decided on the most efficient and effective version. After the code was
## finished, Tianqi tested it and spotted some misreadings of the instructions,
## which we fixed together. Wenzheng wrote a rough note explaining every command,
## and we wrote the final comments together. Ziwen contributed many outstanding
## and creative ideas, supported the team members, and always inspired the team
## to go further and deeper. Group project 1 was finished with good communication
## and collaboration and uploaded to GitHub and Learn on 06/10/2022.
####################################################################
## step 3
## Read the file into R
# setwd("")
a <- scan("pg10.txt", what="character", skip=104) ## skip the preamble and table of contents
n <- length(a)
a <- a[-((n-2886):n)] ## strip the licence text at the end of the file
a <- a[-grep("[0123456789]:[0123456789]", a)] ## strip out verse numbers of the form chapter:verse
a ## inspect the processed text
## step 4
## pre-processing: definition of the function split_punct
split_punct <- function(a){
  ii <- grep('[,.;!:?]', a) ## locate the words containing punctuation
  xs <- rep("", length(a)+length(ii)) ## vector to hold the words plus the separated punctuation marks
  iis <- ii + 1:length(ii) ## positions the punctuation marks will occupy in xs
  xs[iis] <- substr(a[ii], nchar(a[ii]), nchar(a[ii])) ## fill those positions with the punctuation marks
  xs[-iis] <- gsub('[,.;!:?]', '', a) ## fill the remaining positions with the words, punctuation removed
  return(xs) ## return the vector with words and punctuation marks as separate entries
}
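## A minimal check of split_punct on a made-up vector (illustration only, not
## part of the project pipeline; demo_in is a hypothetical example):
demo_in <- c("In", "the", "beginning,", "God", "said:")
split_punct(demo_in) ## expected: "In" "the" "beginning" "," "God" "said" ":"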
## step 5
## use split_punct function to process a
a <- split_punct(a)
## step 6
## create b
a <- tolower(a) ## convert the vector a to lower case
b <- unique(a) ## vector b of the unique words in the bible text
e <- match(a, b) ## for each element of a, the index of the element of b it matches
fre <- tabulate(e) ## frequency of each unique word in a
thre_num <- sort(fre, decreasing=T)[500] ## sort the frequencies in decreasing order to find the threshold count
b <- b[which(fre>=thre_num)] ## keep roughly the 500 most common words in b (ties at the threshold may give slightly more)
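## Quick inspection (illustration only): length(b) can slightly exceed 500 when
## several words tie at the threshold frequency.
length(b) ## roughly 500
head(b[order(fre[fre >= thre_num], decreasing=TRUE)]) ## a few of the most frequent words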
## step 7
## generate the T (triplet) array, stored here as MT
e <- match(a, b) ## for each element of a, the index of the element of b it matches (NA if not a common word)
t <- cbind(c(NA,NA,e),c(NA,e,NA),c(e,NA,NA)) ## three shifted copies of e, so each complete row is a triplet of adjacent word indices
MT <- array(0,c(length(b),length(b),length(b))) ## initialize the MT count array with zeros
t <- t[rowSums(is.na(t))==0,] ## drop the word triplets that contain an NA
## loop through the word triplets, adding 1 to MT[i,k,j] every time the triplet (i,k,j) occurs
for (index in 1:nrow(t)){
  i <- t[index,1]
  k <- t[index,2]
  j <- t[index,3]
  MT[i,k,j] <- MT[i,k,j] + 1
}
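## Quick check (illustration only): every complete triplet was counted exactly
## once, so the total count should equal the number of rows of t.
sum(MT) == nrow(t) ## expect TRUE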
## loop over each vector MT[i,k,], dividing by its sum to turn counts into probabilities
for (index1 in 1:length(b)){
  for (index2 in 1:length(b)) {
    if (sum(MT[index1,index2,])!=0){
      MT[index1,index2,] <- MT[index1,index2,]/sum(MT[index1,index2,])
    }
  }
}
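## Sanity check (illustration only): after normalization each observed row
## MT[i,k,] sums to 1, while never-observed rows remain all zero.
range(rowSums(MT, dims = 2)) ## expect 0 and 1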
## Generate the A (pair) array, stored here as MA
e <- match(a, b) ## translate the vector of words into indices in b
t <- cbind(c(NA,e),c(e,NA)) ## two shifted copies of e, so each complete row is a pair of adjacent word indices
MA <- array(0,c(length(b),length(b))) ## initialize the MA count matrix with zeros
t <- t[rowSums(is.na(t))==0,] ## drop the word pairs that contain an NA
## loop through the word pairs, adding 1 to MA[i,k] every time the pair (i,k) occurs
for (index in 1:nrow(t)){
  i <- t[index,1]
  k <- t[index,2]
  MA[i,k] <- MA[i,k] + 1
}
## loop over each row MA[i,], dividing by its sum to turn counts into probabilities
for (index1 in 1:length(b)){
  if (sum(MA[index1,])!=0){
    MA[index1,] <- MA[index1,]/sum(MA[index1,])
  }
}
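## A vectorized alternative to the normalization loop above (sketch only, left
## commented out since MA has already been normalized at this point):
## MA <- MA / pmax(rowSums(MA), 1) ## all-zero rows are left unchanged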
## Generate vector S
e <- match(a, b)
S <- tabulate(e)
S <- S/sum(S) ## convert the frequencies to probabilities
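## All three models are now in place: S (single word), MA (given the preceding
## word) and MT (given the preceding pair). Quick check (illustration only):
sum(S) ## should be exactly 1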
## step 8
## Word Generation
firstWord <- sample(1:length(b), size=1, prob=S) ## draw the first word with probabilities taken directly from S
## draw the second word with probabilities from MA[firstWord,], conditioning on the first word;
## if that row of MA is all zero, fall back to S
if (sum(MA[firstWord,])!=0){
  secondWord <- sample(1:length(b), size=1, prob=MA[firstWord,])
} else{
  secondWord <- sample(1:length(b), size=1, prob=S)
}
wl <- c(firstWord, secondWord) ## store the two words in vector wl
## given the two preceding words, draw the next word with probabilities from MT[first,second,];
## if that row of MT is all zero, try MA[second,] instead, and if that also fails, fall back to S.
## append each new word to wl and slide the two-word window along, until wl holds 50 words.
for (i in 3:50){
  first <- wl[i-2]
  second <- wl[i-1]
  if (sum(MT[first,second,])!=0){
    nextWord <- sample(1:length(b), size=1, prob=MT[first,second,])
  } else if (sum(MA[second,])!=0){
    nextWord <- sample(1:length(b), size=1, prob=MA[second,])
  } else{
    nextWord <- sample(1:length(b), size=1, prob=S)
  }
  wl <- c(wl, nextWord)
}
wordlist1 <- b[wl] ## translate the indices back into words using b
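## Print the simulated section as readable text (illustration only); fill=TRUE
## wraps the output at the console width.
cat(wordlist1, fill = TRUE)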
## step 9
## simulate a 50-word section of text where the word probabilities are taken directly from S
wl2 <- c()
for (i in 1:50){
  word <- sample(1:length(b), size=1, prob=S)
  wl2 <- c(wl2, word)
}
wordlist2 <- b[wl2]
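## The loop in step 9 could equivalently be written as a single vectorized call
## (alternative sketch, left commented out): 50 independent draws from S.
## wl2 <- sample(1:length(b), size = 50, replace = TRUE, prob = S)
## wordlist2 <- b[wl2]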
## show both results
wordlist1
wordlist2
## step 10