For this, the radius can be in cm or inches (whichever is better). It reflects the number of visits per 10,000,000 people (the population-per-visit ratio is inverted so that more visits give a larger circle radius). The bigger the radius, the more visits per 10 million people.
# Fetch the Wikipedia population page and pull the third table under the
# article body. The body wrapper's id is "mw-content-text" (the same id the
# XPath further down this script uses); the earlier "mw-context-text"
# spelling would match no node and leave `data` as an empty list.
citydata <- read_html("https://en.wikipedia.org/wiki/List_of_Chinese_administrative_divisions_by_population")
data <- citydata %>%
  html_nodes(xpath = "//*[@id='mw-content-text']/div/table[3]") %>%
  html_table()
# Download the page and keep the third table found under the article body;
# only its first two columns are retained.
url <- "https://en.wikipedia.org/wiki/List_of_Chinese_administrative_divisions_by_population"
citydata <- read_html(url)
pagename_head <- '//*[@id="mw-content-text"]/div/table'
table_head <- html_nodes(citydata, xpath = pagename_head)[3]
a <- html_table(table_head)
provincepopulation <- as.data.frame(a)[, c(1, 2)]

# Swap every punctuation character in the division names for a space, which
# breaks up the bracketed footnote markers seen in the printed output.
provincepopulation$Administrative.Division <-
  provincepopulation$Administrative.Division %>%
  str_replace_all("[[:punct:]]", " ")

# Print the leading run of non-underscore characters of each name so the
# cleaned values can be inspected.
provincepopulation$Administrative.Division %>%
  str_extract("[^_]+")
## [1] "China"
## [2] "Guangdong"
## [3] "Shandong"
## [4] "Henan"
## [5] "Sichuan"
## [6] "Jiangsu"
## [7] "Hebei"
## [8] "Hunan"
## [9] "Anhui"
## [10] "Hubei"
## [11] "Zhejiang"
## [12] "Guangxi FN 12 "
## [13] "Yunnan"
## [14] "Jiangxi"
## [15] "Liaoning"
## [16] "Fujian"
## [17] "Shaanxi"
## [18] "Heilongjiang"
## [19] "Shanxi"
## [20] "Guizhou"
## [21] "Chongqing FN 19 "
## [22] "Jilin"
## [23] "Gansu"
## [24] "Inner Mongolia FN 12 "
## [25] "Xinjiang FN 12 "
## [26] "Shanghai< FN 19 "
## [27] "Beijing FN 19 "
## [28] "Tianjin"
## [29] "Hainan"
## [30] "Hong Kong FN 22 "
## [31] "Ningxia FN 12 "
## [32] "Qinghai"
## [33] "Tibet"
## [34] "Macau FN 23 "
## [35] "Republic of China Taiwan FN 39 "
# Keep only the first word of each division name, then restore the names that
# this truncation cuts short or that need a different label:
# "Republic" -> "Taiwan", "Hong" -> "Hong Kong", "Inner" -> "Inner Mongolia".
provincepopulation$Administrative.Division <-
  provincepopulation$Administrative.Division %>%
  str_extract('\\w*') %>%
  str_replace_all("Republic", "Taiwan") %>%
  str_replace_all("Hong", "Hong Kong") %>%
  str_replace_all("Inner", "Inner Mongolia")

# Drop the first row (the country-wide "China" total in the printed output).
provincepopulation <- provincepopulation[-1, ]

# Lower-case the names.
# NOTE(review): this stores a one-column data frame inside the column rather
# than a plain vector; the `replacement`/`cbind()` code further down relies on
# that shape, so do not change it here without updating those lines too.
provincepopulation$Administrative.Division <- as.data.frame(
  tolower(provincepopulation$Administrative.Division)
)
# Per-province visit counts for Xi (columns: province, n).
xi_provincecount <- read_csv("xi_provincecount.csv")
## Parsed with column specification:
## cols(
## province = col_character(),
## n = col_double()
## )
# Replace "uygur" with "xinjiang" so the key can match the population table
# (presumably the counts file labels Xinjiang that way -- verify).
xi_provincecount$province <- str_replace_all(
  xi_provincecount$province, "uygur", "xinjiang"
)

names(provincepopulation)[1:2] <- c("province", "population")

# Build the lookup table with an explicit, plain-character key. The province
# column may hold a nested one-column data frame (possibly a factor), so it is
# flattened first; this avoids the factor/character coercion warning and the
# positional column rename that the implicit join used to need.
replacement <- as.character(
  unlist(provincepopulation$province, use.names = FALSE)
)
combining <- data.frame(
  province = replacement,
  population = provincepopulation$population,
  stringsAsFactors = FALSE
)

xi_radii <- inner_join(xi_provincecount, combining, by = "province")
# Populations arrive as text with thousands separators.
xi_radii$population <- as.numeric(gsub(",", "", xi_radii$population))
# radii = visits per 10,000,000 inhabitants (inverse of people-per-visit).
xi_radii <- xi_radii %>%
  mutate(radii = (1 / ((population / n) / 10000000)))

datatable(
  xi_radii,
  extensions = "Buttons",
  options = list(
    dom = 'Bfrtip',
    buttons = c('copy', 'csv', 'excel', 'pdf', 'print')
  )
)
# Per-province visit counts for Hu (columns: province, n).
hu_provincecount <- read_csv("hu_provincecount.csv")
## Parsed with column specification:
## cols(
## province = col_character(),
## n = col_double()
## )
# Replace "uygur" with "xinjiang" so the key can match the population table
# (presumably the counts file labels Xinjiang that way -- verify).
hu_provincecount$province <- str_replace_all(
  hu_provincecount$province, "uygur", "xinjiang"
)

names(provincepopulation)[1:2] <- c("province", "population")

# Build the lookup table with an explicit, plain-character key. The province
# column may hold a nested one-column data frame (possibly a factor), so it is
# flattened first; this avoids the factor/character coercion warning and the
# positional column rename that the implicit join used to need.
replacement <- as.character(
  unlist(provincepopulation$province, use.names = FALSE)
)
combining <- data.frame(
  province = replacement,
  population = provincepopulation$population,
  stringsAsFactors = FALSE
)

hu_radii <- inner_join(hu_provincecount, combining, by = "province")
# Populations arrive as text with thousands separators.
hu_radii$population <- as.numeric(gsub(",", "", hu_radii$population))
# radii = visits per 10,000,000 inhabitants (inverse of people-per-visit).
hu_radii <- hu_radii %>%
  mutate(radii = (1 / ((population / n) / 10000000)))

datatable(
  hu_radii,
  extensions = "Buttons",
  options = list(
    dom = 'Bfrtip',
    buttons = c('copy', 'csv', 'excel', 'pdf', 'print')
  )
)
#' Scrape an events page and tag each event with a city and a country
#'
#' Reads the text of every node with class "link12", splits the entries into
#' dates and event descriptions, searches each description for a city name and
#' a country name taken from the `world.cities` data set, and joins the
#' matching city record onto each row.
#'
#' @param webpage URL (or other input accepted by `read_html()`) of the page.
#' @return A data frame with columns `date`, `event`, `name` (matched city),
#'   `visitorvisitor` (matched country) followed by the remaining columns of
#'   the filtered `world.cities` table. Rows with no city match have `NA` in
#'   the joined columns.
cv_scrapeR <- function(webpage) {
  data("world.cities")

  page <- read_html(webpage)
  link_text <- page %>%
    html_nodes(xpath = "//*[@class='link12']") %>%
    html_text()
  cv_frame <- as.data.frame(link_text)

  # Entries with more than five runs of non-word characters are taken to be
  # event descriptions; entries with more than five digits are taken to be
  # dates. vapply() keeps the counts an integer vector even for empty input.
  nonword_runs <- vapply(
    gregexpr("\\W+", cv_frame$link_text), length, integer(1)
  )
  digit_count <- vapply(
    gregexpr("\\d", cv_frame$link_text), length, integer(1)
  )
  events <- as.data.frame(cv_frame[nonword_runs > 5, ])
  dates <- as.data.frame(cv_frame[digit_count > 5, ])
  dated_events <- cbind.fill(dates, events)
  colnames(dated_events)[1:2] <- c("date", "event")

  # Only cities with names longer than four characters and a population above
  # 55,000 are used as search terms.
  big_cities <- world.cities %>%
    filter(nchar(name) > 4) %>%
    filter(pop > 55000)

  # One alternation pattern per lookup. Duplicated alternatives can never win
  # over their first occurrence, so unique() shortens the pattern without
  # changing which name is matched.
  cities_str <- paste(unique(big_cities$name), collapse = "|")
  country_str <- paste(unique(world.cities$country.etc), collapse = "|")

  # str_extract() always yields exactly one column per lookup, which the
  # positional naming of columns 3 and 4 below depends on.
  citieslist <- as.data.frame(str_extract(dated_events$event, cities_str))
  countrylist <- as.data.frame(str_extract(dated_events$event, country_str))

  travel <- cbind(dated_events, citieslist, countrylist)
  colnames(travel)[3:4] <- c("name", "visitorvisitor")
  travel$name <- as.character(travel$name)
  # A city match of "China" is mapped to Beijing.
  travel$name[travel$name %in% "China"] <- "Beijing"

  # NOTE(review): if several cities share a name the join repeats the event
  # row once per city -- confirm this is acceptable.
  left_join(travel, big_cities, by = "name")
}
## biodata scraper
#' Scrape the biography table(s) from a page
#'
#' Reads every table directly under the element with id "dataPanel",
#' combines them into one data frame and keeps the first two columns.
#'
#' @param webpage URL (or other input accepted by `read_html()`) of the page.
#' @return A data frame holding the first two columns of the scraped table(s).
#'   The value is returned visibly (the previous trailing assignment returned
#'   it invisibly).
biodata_scrapeR <- function(webpage) {
  bio_tables <- read_html(webpage) %>%
    html_nodes(xpath = "//*[@id='dataPanel']/table") %>%
    html_table()
  as.data.frame(bio_tables)[, c(1, 2)]
}
# Step-by-step run for a single page (id=303, year 2018), executed at top
# level so that every intermediate object stays available for inspection.
cv <- read_html("http://www.chinavitae.com/vip/index.php?mode=events&type=cv&id=303&filter_year=2018")
cv_table <- html_text(html_nodes(cv, xpath = "//*[@class='link12']"))
cv_frame_2018 <- as.data.frame(cv_table)

# Entries with more than five runs of non-word characters (event text) ...
cv_frame_2018a <- as.data.frame(
  cv_frame_2018[sapply(gregexpr("\\W+", cv_frame_2018$cv_table), length) > 5, ]
)
# ... and entries with more than five digits (dates).
cv_frame_2018b <- as.data.frame(
  cv_frame_2018[sapply(gregexpr("\\d", cv_frame_2018$cv_table), length) > 5, ]
)
cv_frame_2018xi <- cbind.fill(cv_frame_2018b, cv_frame_2018a)
colnames(cv_frame_2018xi)[1:2] <- c("date", "event")

# string detect cities and then combine to get country, lat, long
world.cities1 <- filter(world.cities, nchar(name) > 4, pop > 55000)

# Alternation pattern of all retained city names, matched against the events.
cities <- as.list(world.cities1$name)
cities_str <- paste(unlist(cities), collapse = '|')
citieslist <- as.data.frame(str_match(cv_frame_2018xi$event, cities_str))

# Same idea for country names (taken from the unfiltered city table).
country <- as.list(world.cities$country.etc)
country_str <- paste(unlist(country), collapse = '|')
countrylist <- as.data.frame(str_extract(cv_frame_2018xi$event, country_str))

xi2018travel <- cbind(cv_frame_2018xi, citieslist, countrylist)
colnames(xi2018travel)[3:4] <- c("name", "visitorvisitor")
xi2018travel$name <- as.character(xi2018travel$name)
# A city match of "China" is mapped to Beijing.
xi2018travel$name[xi2018travel$name == "China"] <- "Beijing"
combinedframe <- xi2018travel %>%
  left_join(world.cities1, by = "name")
# This still needs visual inspection because some cities are not in the database, so check the rows where name is NA.
# One scraped frame per year for id=303 (the "xi" frames) ...
framexi19 <- cv_scrapeR("http://www.chinavitae.com/vip/index.php?mode=events&type=cv&id=303&filter_year=2019")
framexi18 <- cv_scrapeR("http://www.chinavitae.com/vip/index.php?mode=events&type=cv&id=303&filter_year=2018")
framexi17 <- cv_scrapeR("http://www.chinavitae.com/vip/index.php?mode=events&type=cv&id=303&filter_year=2017")
framexi16 <- cv_scrapeR("http://www.chinavitae.com/vip/index.php?mode=events&type=cv&id=303&filter_year=2016")
framexi15 <- cv_scrapeR("http://www.chinavitae.com/vip/index.php?mode=events&type=cv&id=303&filter_year=2015")
framexi14 <- cv_scrapeR("http://www.chinavitae.com/vip/index.php?mode=events&type=cv&id=303&filter_year=2014")
framexi13 <- cv_scrapeR("http://www.chinavitae.com/vip/index.php?mode=events&type=cv&id=303&filter_year=2013")

# ... and for id=19 (the "hu" frames).
framehu13 <- cv_scrapeR("http://www.chinavitae.com/vip/index.php?mode=events&type=cv&id=19&filter_year=2013")
framehu12 <- cv_scrapeR("http://www.chinavitae.com/vip/index.php?mode=events&type=cv&id=19&filter_year=2012")
framehu11 <- cv_scrapeR("http://www.chinavitae.com/vip/index.php?mode=events&type=cv&id=19&filter_year=2011")
framehu10 <- cv_scrapeR("http://www.chinavitae.com/vip/index.php?mode=events&type=cv&id=19&filter_year=2010")
framehu09 <- cv_scrapeR("http://www.chinavitae.com/vip/index.php?mode=events&type=cv&id=19&filter_year=2009")
framehu08 <- cv_scrapeR("http://www.chinavitae.com/vip/index.php?mode=events&type=cv&id=19&filter_year=2008")
framehu07 <- cv_scrapeR("http://www.chinavitae.com/vip/index.php?mode=events&type=cv&id=19&filter_year=2007")
framehu06 <- cv_scrapeR("http://www.chinavitae.com/vip/index.php?mode=events&type=cv&id=19&filter_year=2006")
framehu05 <- cv_scrapeR("http://www.chinavitae.com/vip/index.php?mode=events&type=cv&id=19&filter_year=2005")
framehu04 <- cv_scrapeR("http://www.chinavitae.com/vip/index.php?mode=events&type=cv&id=19&filter_year=2004")
framehu03 <- cv_scrapeR("http://www.chinavitae.com/vip/index.php?mode=events&type=cv&id=19&filter_year=2003")

# Stack the yearly frames; rows whose joined country is missing (no city
# match) are labelled "China".
combined_xi <- rbind(
  framexi19, framexi18, framexi17, framexi16, framexi15, framexi14, framexi13
)
combined_xi$country.etc <- replace(
  combined_xi$country.etc, is.na(combined_xi$country.etc), "China"
)

combined_hu <- rbind(
  framehu13, framehu12, framehu11, framehu10, framehu09, framehu08,
  framehu07, framehu06, framehu05, framehu04, framehu03
)
combined_hu$country.etc <- replace(
  combined_hu$country.etc, is.na(combined_hu$country.etc), "China"
)
Frames containing all information
# Overseas trips: events whose text mentions "travelled" (case-insensitive)
# and whose country is not China.
xi_overseas_details <- filter(
  combined_xi,
  str_detect(tolower(event), "travelled"),
  country.etc != "China"
)
hu_overseas_details <- filter(
  combined_hu,
  str_detect(tolower(event), "travelled"),
  country.etc != "China"
)

# Interactive tables with copy/export buttons.
datatable(
  xi_overseas_details,
  extensions = "Buttons",
  options = list(
    dom = 'Bfrtip',
    buttons = c('copy', 'csv', 'excel', 'pdf', 'print')
  )
)
datatable(
  hu_overseas_details,
  extensions = "Buttons",
  options = list(
    dom = 'Bfrtip',
    buttons = c('copy', 'csv', 'excel', 'pdf', 'print')
  )
)
## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
# Load the overseas-trip file for Xi.
xi_overseas <- read_csv("combined_xiint.csv")
## Parsed with column specification:
## cols(
## date = col_character(),
## name = col_character(),
## country.etc = col_character(),
## year = col_double(),
## month = col_character()
## )
# Keep date, country and year, with a single row per date (the first one).
xi_overseastally <- distinct(
  select(xi_overseas, date, country.etc, year),
  date,
  .keep_all = TRUE
)
# Load the overseas-trip file for Hu.
hu_overseas <- read_csv("combined_huint.csv")
## Parsed with column specification:
## cols(
## date = col_character(),
## name = col_character(),
## country.etc = col_character(),
## year = col_double(),
## month = col_character()
## )
# Keep date, country and year, with a single row per date (the first one).
hu_overseastally <- distinct(
  select(hu_overseas, date, country.etc, year),
  date,
  .keep_all = TRUE
)
# Events that took place in Beijing, restricted to those where a country name
# was found in the event text (taken here as hosting a foreign visitor).
xi_visitors <- combined_xi %>%
  filter(str_detect(tolower(event), "was in beijing municipality "))
xi_nonavisitors <- xi_visitors %>%
  drop_na(visitorvisitor) %>%
  select(date, visitorvisitor)

# Derive the calendar year. Dates with a two-digit year parse under "%Y" as
# years 0-99, so those are shifted into the 2000s numerically. The previous
# text substitution of "00" by "20" depended on platform-specific zero padding
# of "%Y" output and would also corrupt four-digit years such as "2003".
xi_nonavisitors$year <- as.integer(
  format(as.POSIXct(xi_nonavisitors$date, format = "%m/%d/%Y"), "%Y")
)
xi_nonavisitors$year <- as.character(ifelse(
  xi_nonavisitors$year < 100L,
  xi_nonavisitors$year + 2000L,
  xi_nonavisitors$year
))

### Days spent hosting overseas guests
xi_travelyears1 <- xi_nonavisitors %>%
  group_by(year) %>%
  tally()
# Events that took place in Beijing, restricted to those where a country name
# was found in the event text (taken here as hosting a foreign visitor).
hu_visitors <- combined_hu %>%
  filter(str_detect(tolower(event), "was in beijing municipality "))
hu_nonavisitors <- hu_visitors %>%
  drop_na(visitorvisitor) %>%
  select(date, visitorvisitor)

# Derive the calendar year. Dates with a two-digit year parse under "%Y" as
# years 0-99, so those are shifted into the 2000s numerically. The previous
# text substitution of "00" by "20" depended on platform-specific zero padding
# of "%Y" output and would also corrupt four-digit years such as "2003".
hu_nonavisitors$year <- as.integer(
  format(as.POSIXct(hu_nonavisitors$date, format = "%m/%d/%Y"), "%Y")
)
hu_nonavisitors$year <- as.character(ifelse(
  hu_nonavisitors$year < 100L,
  hu_nonavisitors$year + 2000L,
  hu_nonavisitors$year
))

### Days spent hosting overseas guests
hu_travelyears1 <- hu_nonavisitors %>%
  group_by(year) %>%
  tally()
# The tally column holds the number of hosting rows per year.
names(hu_travelyears1)[2] <- "visited_days"
names(xi_travelyears1)[2] <- "visited_days"

# Yearly outbound-travel counts (columns: year, n).
hu_travelout <- read_csv("hutravelout.csv")
## Parsed with column specification:
## cols(
## year = col_double(),
## n = col_double()
## )
xi_travelout <- read_csv("xitravelout.csv")
## Parsed with column specification:
## cols(
## year = col_double(),
## n = col_double()
## )
names(hu_travelout)[2] <- "visiting"
names(xi_travelout)[2] <- "visiting"

# Year was stored as text above; make it numeric so it matches the CSV key.
hu_travelyears1$year <- as.numeric(hu_travelyears1$year)
xi_travelyears1$year <- as.numeric(xi_travelyears1$year)

# Side-by-side hosting vs. travelling counts per year.
hu_inout <- hu_travelyears1 %>%
  inner_join(hu_travelout)
## Joining, by = "year"
xi_inout <- xi_travelyears1 %>%
  inner_join(xi_travelout)
## Joining, by = "year"

datatable(
  hu_inout,
  extensions = "Buttons",
  options = list(
    dom = 'Bfrtip',
    buttons = c('copy', 'csv', 'excel', 'pdf', 'print')
  )
)
datatable(
  xi_inout,
  extensions = "Buttons",
  options = list(
    dom = 'Bfrtip',
    buttons = c('copy', 'csv', 'excel', 'pdf', 'print')
  )
)