Skip to content

텍스트만 수집 (Collect text only)

Chan-Yub Park edited this page Jun 22, 2017 · 2 revisions
# One-time environment setup: the install.packages()/install_github()
# calls only need to run once per machine; comment them out after the
# first successful run to avoid reinstalling on every execution.
install.packages("selectr")
library(curl)
library(rvest)
devtools::install_github("forkonlp/N2H4")
library(N2H4)

# Keep character columns as characters when building data frames below.
# Use TRUE/FALSE, never the reassignable shortcuts T/F.
options(stringsAsFactors = FALSE)

# curl multi "done" callback: log the HTTP status, re-encode the response
# body from CP949 (EUC-KR, used by news.naver.com) to UTF-8, and append
# the response to the global `data` list that is reset before each
# multi_run() batch.
success <- function(res){
  # Bug fix: curl response objects expose the HTTP code as `status_code`,
  # not `status` — `res$status` is NULL, so the original printed no code.
  cat("Request done! Status:", res$status_code, "\n")
  res$content<-iconv(rawToChar(res$content),from="CP949",to="UTF-8")
  # `<<-` intentionally mutates the global accumulator set up per page.
  data <<- c(data, list(res))
}
# curl multi "fail" callback: report the error message and move on —
# nothing is retried or recorded here.
failure <- function(msg){
  line <- paste("Oh noes! Request failed!", msg, "\n")
  cat(line)
}


# Fetch the top-level Naver news categories and their sub-categories via
# the N2H4 helpers, then flatten everything into a single data frame
# `scate` with columns: cate_name, sid1, plus the sub-category columns.
cate<-getMainCategory()

subcate<-lapply(cate[,2], getSubCategory)

# Build one data frame per category, then bind once at the end — this
# replaces the original O(n^2) grow-by-rbind-inside-a-loop pattern and
# the unsafe `1:length(subcate)` index sequence.
scate <- do.call(
  rbind,
  lapply(seq_along(subcate), function(i) {
    data.frame(cate_name = cate[i, 1], sid1 = cate[i, 2], subcate[[i]])
  })
)

# Scrape window. NOTE(review): strDate (2017-01-01) is AFTER endDate
# (2010-03-31), so the `strDate:endDate` sequence in the main loop below
# runs in DESCENDING order — newest day first. Confirm this is intended.
strDate<-as.Date("2017-01-01")
endDate<-as.Date("2010-03-31")

# Wall-clock timestamps used only for the progress messages in the loop:
# strTime = script start, midTime = time of the last logged step.
strTime<-Sys.time()
midTime<-Sys.time()

# Main scrape loop: iterate each day in the (descending) strDate:endDate
# range and each category row of `scate`; for every list page, fetch the
# article HTML in parallel via curl's multi interface, extract titles and
# bodies with N2H4, write them under ./data/, then fetch and write the
# comment threads per article.
for (date in strDate:endDate){
  # `date` comes out of the `:` sequence as a numeric day offset;
  # rebuild the "YYYYMMDD" string the list URL expects.
  date<-gsub("-","",as.character(as.Date(date,origin = "1970-01-01")))
  for (i in 1:nrow(scate)){
    # Progress log: category, per-step and overall elapsed time.
    print(paste0(date," / ",scate[i,1] ," - ",scate[i,3]," / start Time: ", strTime," / spent Time: ", Sys.time()-midTime," / spent Time at first: ", Sys.time()-strTime))
    midTime<-Sys.time()
    
    # Article-list URL for this sid1/sid2 category pair and day.
    pageUrli<-paste0("http://news.naver.com/main/list.nhn?sid2=",scate[i,4],"&sid1=",scate[i,2],"&mid=shm&mode=LS2D&date=",date)
    trym<-0
    # Up to 6 attempts total with a short random sleep between retries.
    # NOTE(review): if every attempt fails, `max` is still a try-error
    # object and `1:max` below errors out — there is no final guard.
    # (Style: inherits(max, "try-error") is preferred over class()==.)
    max<-try(getMaxPageNum(pageUrli), silent = T)
    while(trym<=5&&class(max)=="try-error"){
      max<-try(getMaxPageNum(pageUrli), silent = T)
      Sys.sleep(abs(rnorm(1)))
      trym<-trym+1
      print(paste0("try again max num: ",pageUrli))
    }
    closeAllConnections()
    for (pageNum in 1:max){
      print(paste0(date," / ",scate[i,1]," / ",scate[i,3]," / ",pageNum, " / start Time: ", strTime," / spent Time: ", Sys.time()-midTime," / spent Time at first: ", Sys.time()-strTime))
      midTime<-Sys.time()
      pageUrl<-paste0(pageUrli,"&page=",pageNum)
      tryp<-0
      # Same 6-attempt retry pattern for the article-link list.
      newsList<-try(getUrlListByCategory(pageUrl), silent = T)
      while(tryp<=5&&class(newsList)=="try-error"){
        newsList<-try(getUrlListByCategory(pageUrl), silent = T)
        Sys.sleep(abs(rnorm(1)))
        tryp<-tryp+1
        print(paste0("try again max num: ",pageUrl))
      }

      if(nrow(newsList)==0){
        print(paste0("no news link: ", pageUrl))
        next
      }

      closeAllConnections()
      # Parallel fetch of every article URL on this list page; the
      # `success` callback appends each response to the global `data`.
      # NOTE(review): `pool` is created but never passed to
      # curl_fetch_multi()/multi_run(), so the default pool is used.
      pool <- new_pool()
      data <- list()
      sapply(newsList$links, function(x) curl_fetch_multi(x,success,failure))
      res <- multi_run()
      
      # Presumably a second drain in case the first multi_run() returned
      # before any transfer completed — TODO confirm.
      if( identical(data, list()) ){
        res <- multi_run()
      }
      
      # Keep only responses whose final URL is on news.naver.com
      # (responses redirected elsewhere are dropped), then parse the
      # title and body out of each HTML document.
      loc<-sapply(data, function(x) grepl("^http://news.naver",x$url))
      cont<-sapply(data, function(x) x$content)
      cont<-cont[loc]
      
      titles<-unlist(lapply(cont,function(x) getContentTitle(read_html(x))))
      bodies<-unlist(lapply(cont,function(x) getContentBody(read_html(x))))
#      presses<-unlist(lapply(cont,function(x) getContentPress(read_html(x))))
#      data<-data.frame(title=titles,body=bodies)
      
      # One title file and one body file per (category, date, page).
      dir.create("./data",showWarnings=F)
      dir.create(paste0("./data/cate_",scate[i,4]),showWarnings=F)
      write.table(titles, file=paste0("./data/cate_",scate[i,4],"/news",scate[i,2],"_",scate[i,4],"_",date,"_",pageNum,"_title.csv"),row.names = F,col.names = F)
      write.table(bodies, file=paste0("./data/cate_",scate[i,4],"/news",scate[i,2],"_",scate[i,4],"_",date,"_",pageNum,"_body.csv"),row.names = F,col.names = F)
      closeAllConnections()
      
      print("get comment")
      
      urls<-newsList$links
      
      dir.create(paste0("./data/news_comments/"),showWarnings=F)
      
      # Fetch the comment thread for every article on this list page.
      for(getCom in 1:length(urls)){
        
        # 7th "="-separated token of the article URL — presumably the
        # article id (aid) used to name the output file; TODO confirm
        # against a sample list.nhn link.
        fortit<-strsplit(urls[getCom],"=")[[1]][7]
        
        trycom<-0
        comDat<-try(getComment(urls[getCom]), silent = T)
        while(trycom<=5&&class(comDat)=="try-error"){
          comDat<-try(getComment(urls[getCom]), silent = T)
          Sys.sleep(abs(rnorm(1)))
          trycom<-trycom+1
          print(paste0("try again comment num: ", newsList$links[getCom]))
        }
        
        # All 6 attempts failed — skip this article entirely.
        if(trycom==6){ next }
        
        cnt<-comDat$result$count$comment
      
        print(paste0("comment count: ",cnt))
      
        if(cnt==0){ next }
      # More than one 100-comment page: fetch every page and concatenate
      # the `contents` columns.
      # NOTE(review): round(cnt/100, 0) rounds half to even, so pn can
      # request one page past the end; the nrow(tem)!=0 guard below
      # absorbs the resulting empty page.
      if(cnt>100){
        pn<-round(cnt/100,0)+1
        comDat<-c()
        for(PN in 1:pn){
          tryt<-0
          tem<-try(getComment(urls[getCom],pageSize = 100,page = PN), silent = T)
          while(tryt<=5&&class(tem)=="try-error"){
            tem<-try(getComment(urls[getCom],pageSize = 100,page = PN), silent = T)
            Sys.sleep(abs(rnorm(1)))
            tryt<-tryt+1
            print(paste0("try again comment: ",pageUrli))
          }
          closeAllConnections()
          
          tem<-as.data.frame(tem$result$commentList)
          if(nrow(tem)!=0){
            tem<-tem[,c("contents")]
            comDat<-c(comDat,tem)
          }
        }
        datC<-comDat
      }
      # 1..100 comments fit on a single page.
      if(cnt<=100&cnt>0){
        tryt<-0
        tem<-try(getComment(urls[getCom],pageSize = 100,page = 1), silent = T)
        while(tryt<=5&&class(tem)=="try-error"){
          tem<-try(getComment(urls[getCom],pageSize = 100,page = 1), silent = T)
          Sys.sleep(abs(rnorm(1)))
          tryt<-tryt+1
          print(paste0("try again comment: ",pageUrli))
        }
        tem<-as.data.frame(tem$result$commentList)
        datC<-tem[,c("contents")]
      }
      
      # One comment file per article.
      # NOTE(review): if the retry loop above exhausted its attempts,
      # `tem` is still a try-error and the as.data.frame()/write below
      # will error; `datC` can also carry a stale value from a previous
      # article if neither branch above ran — worth guarding.
      write.table(datC, file=paste0("./data/news_comments/news_",date,"_",fortit,"_comments.csv"),row.names = F,col.names = F)
      }
    }
  }
}