setwd("Dropbox/text")
setwd("Dropbox/text/")
setwd("/Dropbox/text/")
setwd("Dropbox")
setwd("../Dropbox/text")
# (dir.files() and files.dir() don't exist; list.files() is the correct call)
list.files()
load("bus5.RData")
search.results
ls()  # (a bare "ls" just prints the function body)
search.string
site.mat
getwd()
setwd("../../github/local/gtext")
source("gtext.R")
search.string <- "eric lin"
search.results <- getForm("http://www.google.com/search", hl = "en",
                          lr = "", q = search.string, btnG = "Search")
# if you want to parse this into a tree:
# search.results.tree <- htmlTreeParse(search.results, useInternal = TRUE); rm(site)

# take out the top links from the search result
# define the regex to get the links:
# 2012/03/06 - need a new mask to get the scraping right on google. did
# they change how results are returned?
# [DEPRECATED?] reg1 <- "(?<=hlprwt\\(this\\,\\s\').+?(?=\'\\))"
reg1 <- "(?<=\\/url\\?q\\=).+?(?=\\&)"
# "/url?q=" is the lead tag, "&" is the end string. note that
# all "\\" are escapes for Perl-based regex

# extract weblinks from the Google search page - use Perl regex
# create a matrix that stores the location information for the URLs
site.loc <- gregexpr(reg1, search.results, perl = TRUE)
# take the "match.length" attribute directly; unlist(attributes(...)) also
# picks up "useBytes" in newer R and breaks the cbind
site.mat <- cbind(site.loc[[1]], attr(site.loc[[1]], "match.length"))
row.names(site.mat) <- NULL
colnames(site.mat) <- c("loc", "length")
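# illustrative sketch (not part of the original session): the same lookaround
# regex applied to a toy snippet, to show what "loc" and "length" record
toy <- '<a href="/url?q=http://example.com/&amp;sa=U">result</a>'
toy.loc <- gregexpr(reg1, toy, perl = TRUE)
substring(toy, toy.loc[[1]],
          toy.loc[[1]] + attr(toy.loc[[1]], "match.length") - 1)
# [1] "http://example.com/"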
# create a storage vector for the URLs
# (link.threshold - the number of links to keep - is assumed set by gtext.R)
url.stor <- matrix(data = NA, nrow = link.threshold, ncol = 2)
for (i in 1:link.threshold) {
  url.stor[i, 1] <- substring(search.results,
                              site.mat[i, "loc"],
                              site.mat[i, "loc"] + site.mat[i, "length"] - 1)
}

# take out all .pdf references (escape the dot so it matches literally)
idx.keep.pdf <- grep("\\.pdf", url.stor[, 1], perl = TRUE, invert = TRUE)
url.stor <- as.matrix(url.stor[idx.keep.pdf, ])
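# quick illustration (not from the session) of grep() with invert = TRUE,
# which returns the indices of the NON-matching entries:
grep("\\.pdf", c("a.html", "b.pdf", "c.htm"), invert = TRUE)
# [1] 1 3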
url.stor
max.len <- sum(!is.na(url.stor))  # number of URLs actually captured
url.stor2 <- matrix(data = "", nrow = 5, ncol = 1)
# create a matrix to store the results in - 5 units long
# (assumes max.len <= 5; sizing by max.len would be safer)
for (i in 1:max.len) {
  # get the text content from the URLs (top [link.threshold])
  url.stor2[i] <- htmlToText(url.stor[i, 1])
  # drop the returns, tabs, and newlines
  url.stor2[i] <- gsub("[\n\t\r]", "", url.stor2[i])
}
url.stor2
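# htmlToText() comes from gtext.R; as a point of reference, here is a minimal
# sketch of such a helper using RCurl + XML (an assumption, not the actual
# gtext.R definition - named *.sketch so it doesn't clobber the sourced one):
htmlToText.sketch <- function(url) {
  doc  <- RCurl::getURL(url, followlocation = TRUE)  # fetch the page
  html <- XML::htmlParse(doc, asText = TRUE)         # parse to a DOM tree
  # concatenate all text nodes under <body>
  paste(XML::xpathSApply(html, "//body//text()", XML::xmlValue),
        collapse = " ")
}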
q()  # (":q()" above was a stray vim habit)