# R Useful Code

############################ Text Analytics#####################################

### Trim leading and trailing whitespace
# In the recipes below, `v` stands for the column that holds the text data.
library(stringr)

# Note: this recipe is applied to the `activity_name` column, not to `v`.
df3$activity_name <- str_trim(df3$activity_name, side = "both")

### Remove every character that is not alphanumeric, an apostrophe or a space

df3$v <- gsub("[^[:alnum:]' ]", "", df3$v)

### Remove short words (1 to 3 word characters long)

df3$v <- gsub("\\b\\w{1,3}\\b", "", df3$v)

### Collapse runs of whitespace within a sentence into a single space
# The pattern must not contain literal quote characters: '"\\s+"' would only
# match whitespace that is enclosed in double quotes, i.e. almost never.
# Replacing with " " (rather than "") keeps neighbouring words separated.

df3$v <- gsub("\\s+", " ", df3$v)

### Remove all numbers

df3$v <- gsub("[0-9]+", "", df3$v)


##################### Removing variables with zero or near-zero variance ########
# Dummy-encode the categorical columns, then drop columns whose variance is
# zero or close to zero.
# NOTE(review): createDummyFeatures() and nearZeroVar() are not loaded in this
# file -- presumably they come from the mlr and caret packages; load them first.
df_cat_dum <- createDummyFeatures(df, cols = c("v1", "v2", "v3"))

# Column indices flagged as (near-)zero variance; may be empty.
x <- nearZeroVar(df_cat_dum)

# Guard the empty case: df[, -integer(0)] selects *no* columns, so negative
# indexing must only be used when something was actually flagged.
# drop = FALSE keeps the result a data frame even if one column remains.
if (length(x) > 0) {
  df_cat_dum_final <- df_cat_dum[, -x, drop = FALSE]
} else {
  df_cat_dum_final <- df_cat_dum
}


#### Working with H2O in R ########################

# Train a gradient boosting model with H2O.
library(h2o)
h2o.init()

# Convert the local data frame into an H2O frame.
data.hex <- as.h2o(df_analytical_v2)

# Split into two frames using a 0.75 ratio for the first one; the seed makes
# the split reproducible.
splits <- h2o.splitFrame(
  data.hex,
  ratios = 0.75,
  destination_frames = c("train.hex", "test.hex"),
  seed = 1234
)
train <- splits[[1]]
test <- splits[[2]]

response <- "target"

# No trailing comma here: c("v1", ..., "v4",) fails with "argument 5 is empty".
predictors <- c("v1", "v2", "v3", "v4")

# The second split is passed as the validation frame, so its metrics are
# reported during training.
data.gbm <- h2o.gbm(
  x = predictors,
  y = response,
  training_frame = train,
  validation_frame = test,
  ntrees = 500,
  score_tree_interval = 10,
  sample_rate = 0.8,
  col_sample_rate = 0.8,
  distribution = "gaussian",
  seed = 1234
)
          
# Reference (H2O R booklet): https://www.h2o.ai/wp-content/uploads/2018/01/RBooklet.pdf