#-----------------------------------------------------------------------------
# gtext() takes a search string, plugs it into the Google search engine, and
# returns text scraped from the top hits (up to link.threshold of them). This
# is a work in progress, so results are not guaranteed.
#-----------------------------------------------------------------------------
# get libraries
library(RCurl)
library(XML)
library(stringr)
source("htmlToText.R")
gtext <- function(search.string, link.threshold){
  # grab the google search results page (a single HTML string)
  search.results <- getForm("http://www.google.com/search", hl = "en",
                            lr = "", q = search.string, btnG = "Search")
  # if you want to parse this into a tree:
  # search.results.tree <- htmlTreeParse(search.results, useInternal = TRUE)
  # pull the top links out of the search results
  # define the regex used to extract the links:
  # 2012/03/06 - need a new mask to get the scraping right on google. Did
  # they change how results are returned?
  # [DEPRECATED?] reg1 <- "(?<=hlprwt\\(this\\,\\s\').+?(?=\'\\))"
  reg1 <- "(?<=\\/url\\?q\\=).+?(?=\\&)"
  # "/url?q=" is the lead tag and "&" is the end string; all "\\" are escapes
  # for the perl-based regex
  # extract the weblinks from the google search page - use perl regex;
  # site.mat stores the start position and length of each URL match
  site.loc <- gregexpr(reg1, search.results, perl = TRUE)
  site.mat <- cbind(site.loc[[1]], attr(site.loc[[1]], "match.length"))
  row.names(site.mat) <- NULL
  colnames(site.mat) <- c("loc", "length")
  # create a storage matrix for the URLs (don't read past the matches found)
  n.links <- min(link.threshold, nrow(site.mat))
  url.stor <- matrix(data = NA, nrow = n.links, ncol = 1)
  for(i in 1:n.links){
    url.stor[i, 1] <- substring(search.results, site.mat[i, "loc"],
                                site.mat[i, "loc"] + site.mat[i, "length"] - 1)
  }
  # drop any .pdf references
  idx.keep.pdf <- grep("\\.pdf", url.stor[, 1], perl = TRUE, invert = TRUE)
  url.stor <- url.stor[idx.keep.pdf, , drop = FALSE]
  # grab the text content from the remaining top [link.threshold] URLs
  max.len <- sum(!is.na(url.stor[, 1]))
  # character vector to hold the scraped text, one element per URL
  url.stor2 <- character(max.len)
  for(i in seq_len(max.len)){
    # get the text content from each URL
    url.stor2[i] <- htmlToText(url.stor[i, 1])
    # drop newlines, tabs, and carriage returns
    url.stor2[i] <- gsub("[\n\t\r]", "", url.stor2[i])
  }
  # collapse the scraped pages into a single record for this search;
  # paste(..., collapse = " ") handles any number of links, so nothing has to
  # be hard coded to five slots
  text <- paste(url.stor2, collapse = " ")
  # return the number of links actually used along with the extracted text
  output <- cbind(max.len, text)
  return(output)
}
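
# Example usage - a sketch only: it fires a live query against Google, whose
# markup changes over time, so the scrape may come back empty.
# res <- gtext("text mining in R", link.threshold = 3)
# res[, "max.len"]                 # number of links actually scraped
# substr(res[, "text"], 1, 200)    # first 200 characters of the combined text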