# dados e api
# hadley
# Cumulative number of posts over time for two selected authors, drawn as one
# line per author.
# NOTE(review): this chain used to end in a dangling `+`, which left the
# expression incomplete; geom_line() is now the last layer. Any further layers
# that may have followed are not present in this file.
posts %>%
  filter(from_name %in% c('Doris S M Fontes',
                          'Guilherme Carrara Neto')) %>%
  # parse the creation timestamp so posts can be ordered chronologically
  mutate(time = ymd_hms(created_time)) %>%
  arrange(time) %>%
  group_by(from_name) %>%
  # running post count per author: cumulative sum over a column of ones
  mutate(um = 1, acu = cumsum(um)) %>%
  ungroup %>%
  ggplot(aes(x = time, y = acu, colour = from_name)) +
  geom_line()
Primeiro, vamos dar uma limpada nos textos. Consideramos só os textos em português, de início!
- ignore case
- remove punctuation
- remove stop words
- remove accents
- remove numbers
- remove links and emails
# Restrict language detection to the English and Portuguese byte profiles,
# then keep only status/link posts that have a message and are classified as
# Portuguese.
prof <- TC_byte_profiles[
  is.element(names(TC_byte_profiles), c("english", "portuguese"))
]
posts_messages <- posts %>%
  filter(!is.na(message)) %>%
  filter(type %in% c('status', 'link')) %>%
  mutate(lingua = textcat(message, p = prof)) %>%
  filter(lingua == 'portuguese')
# Text-cleaning helpers. Each one takes a character vector and returns a
# character vector of the same length.

# Transliterate to ASCII, then remove every backtick and apostrophe from the
# result.
# NOTE: flagged by the original author as working ONLY ON LINUX — presumably
# because the output of iconv(..., to = "ASCII//TRANSLIT") is platform
# dependent; confirm before running elsewhere.
rm_accent <- function(x) gsub("`|\\'", "", iconv(x, to = "ASCII//TRANSLIT"))

# Replace each run of slashes or colons with a single space.
tira_barras <- function(x) gsub('/+|(\\:)+', ' ', x)

# Delete e-mail addresses. The local part may contain letters, digits and
# . _ % + -, and the domain may have several dot-separated labels, so that
# "joao.silva2@empresa.com.br" is removed as a whole instead of leaving
# fragments such as ".br" behind.
tira_emails <- function(x) {
  gsub('[[:alnum:]._%+-]+@[[:alnum:]-]+(\\.[[:alnum:]-]+)+', '', x)
}

# Replace anything that starts with "http" by a single space. The URL ends at
# the first whitespace character of any kind (space, tab, newline), so text on
# the following line is not swallowed.
tira_links <- function(x) gsub('http[^[:space:]]+', ' ', x)
# tira_s <- function(x) {gsub('spss', 'spsss', x); gsub('s ', ' ', x)}
# Regular expression for words that should be dropped from the (lower-cased,
# accent-free) post text.
#
# Three alternatives are combined:
#   * stems: a banned stem at the start of a word plus everything up to the
#     next space ("vaga" also removes "vagas");
#   * exact words, optionally followed by a plural "s" ("dado" and "dados");
#   * a single letter standing between two spaces.
# Stems and exact words are anchored with word boundaries so that they no
# longer eat pieces of unrelated words (e.g. "rio" inside "salario", "data"
# inside "candidata", "todo" inside "metodo").
banned_words <- local({
  stems <- c('vaga', 'trabalh', 'acompanha', 'principal',
             'profission', 'estatistic')
  exact <- c('todo', 'toda', 'paulo',
             'expiracao', 'janeiro', 'fevereiro', 'marco',
             'abril', 'maio', 'junho', 'julho', 'agosto',
             'setembro', 'outubro', 'novembro', 'dezembro',
             'senior', 'pleno', 'dado', 'data', 'local',
             'codigo', 'nivel', 'hierarquico', 'rio', 'junior',
             'estagio', 'quantidade')
  paste0(
    '\\b(', paste(stems, collapse = '|'), ')[^ ]*',
    '|\\b(', paste(exact, collapse = '|'), ')s?\\b',
    '| [a-z] '
  )
})

# Replace every match of `banned_words` in x with a single space.
tira_banned <- function(x) gsub(banned_words, ' ', x)
# Build a corpus with one document per Portuguese post message and normalise
# the text step by step.
# NOTE(review): this chain used to end in a dangling `%>%`, which fused it with
# the assignment that follows; removeNumbers is now the final step. Any step
# that may have followed it is not present in this file.
d_tm_raw <- VCorpus(VectorSource(posts_messages$message))
d_tm <- d_tm_raw %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  # e-mails and links go first, while "@", "/" and ":" are still in the text
  tm_map(content_transformer(tira_emails)) %>%
  tm_map(content_transformer(tira_links)) %>%
  tm_map(content_transformer(tira_barras)) %>%
  # first stop-word pass: accented forms
  tm_map(removeWords, stopwords('pt-br')) %>%
  tm_map(removePunctuation, preserve_intra_word_dashes = TRUE) %>%
  tm_map(content_transformer(rm_accent)) %>%
  tm_map(content_transformer(tira_links)) %>%
  # second stop-word pass: the same list with accents stripped
  tm_map(removeWords, unique(rm_accent(stopwords('pt-br')))) %>%
  tm_map(content_transformer(tira_banned)) %>%
  # tm_map(content_transformer(tira_s)) %>%
  tm_map(removeNumbers)
# Document-term matrix weighted by non-normalised TF-IDF, then pruned with
# removeSparseTerms at a sparsity threshold of 0.95.
ctrl <- list(
  weighting = function(m) weightTfIdf(m, normalize = FALSE)
)
dtm <- d_tm %>%
  DocumentTermMatrix(control = ctrl) %>%
  removeSparseTerms(0.95)
# By default dtm is a sparse matrix: good for storing data, awkward to explore.
# Luckily the data set is small, so it can be converted to a data.frame.
# add_rownames() moves the document ids from the row names into a `rowname`
# column.
# NOTE(review): this chain used to end in a dangling `%>%`, which fused it with
# the `palavras` assignment that follows; add_rownames() is now the last step.
d_dtm <- data.frame(as.matrix(dtm)) %>%
  add_rownames()
# For every document keep up to its 10 highest-weighted ("most informative")
# terms, and drop any term whose weight is not positive.
palavras <- d_dtm %>%
  mutate(rowname = as.numeric(rowname)) %>%
  gather(key, val, -rowname) %>%
  arrange(-val) %>%
  group_by(rowname) %>%
  slice(seq_len(10)) %>%
  ungroup() %>%
  filter(val > 0)
max.words = 100,
colors = brewer.pal(8, 'Dark2'))
Tipos de posts
# Number of posts per type, most frequent first, plus each type's share of the
# total in percent (two decimals).
mutate(
  count(posts, type, sort = TRUE),
  ` %` = round(n/sum(n) * 100, 2)
)
Note que são dois tipos de posts principais, status e links. Vamos analisar separadamente.
# Split the posts into the two main types, one data frame each.
status <- filter(posts, type %in% 'status')
link <- filter(posts, type %in% 'link')
Em relação aos status, podemos notar que existe uma grande quantidade de vagas do site www.vagas.com.br
# How many status posts mention www.vagas.com.br, and how many do not.
count(
  mutate(status, vagas = str_detect(message, 'www\\.vagas\\.com\\.br')),
  vagas,
  sort = TRUE
)
Geralmente, os posts desse site contêm um padrão conhecido.
# Print one example post that mentions www.vagas.com.br: the author and the
# message, each followed by a "@@@@@@@" separator.
aux <- status %>%
  filter(str_detect(message, 'www\\.vagas\\.com\\.br')) %>%
  slice(1) %>%
  select(from_name, message) %>%
  sapply(function(campo) cat(campo, '\n\n@@@@@@@\n\n', sep = ''))
## Guilherme Carrara Neto
## @@@@@@@
## Latin America Strategic Workforce Planning Coordinator (GE - General Electric)
## Nível hierárquico:Supervisão/Coordenação
## Local:São Paulo / SP / BR
## Quantidade de vagas:1
## Role Summary/Purpose
## We are looking for a Strategic Workforce Planning Coordinator to join a growing team focused on delivering a data-driven consultative experience to internal business clients. Through consultative engagements, the SWP team quantifies and assesses the supply and demand of talent required to successfully inform and execute business strategy.
## Essential Responsibilities
## The SWFP Coordinator would be engaged in developing and delivering insights for complex workforce planning projects and programs using inputs from a variety of data sources, business leaders and HR Business Partners. This role will drive the projects derived from the WPF analysis. Areas of concentration include consulting, project management, headcount forecasting, scenario planning, and gap analysis.
## Consult with business partners to identify workforce needs, develop hypotheses and execute related quantitative analyses, (e.g. attrition, retirement, and movement)
## Lead the development of a compelling story through data and transform complex data elements into a consumable format for various levels of management.
## Synthesize analyses through activities into comprehensive presentations to foster discussion and communicate key insights.
## Identify and incorporate multiple data elements to support analysis. Facilitate conversations with data owners to ensure proper utilization and management of the data.
## Execute research and analysis and interpret findings to support the identification of solutions that address organizational gaps.
## Lead all the project management activities to ensure the thorough execution of special projects derives from WFP analysis. Continued development of key data sources, tools, templates and processes.
## Qualifications/Requirements
## Bachelor's degree in economics, Mathematics, Statistics, human resources or a related field.
## 3-7 years of analytical and/or consulting experience.
## strong data management skills
## Demonstrated ability to apply consultative mind-set, problem-solving and critical thinking.
## Team player with strong interpersonal and executive communication.
## Ability to balance multiple priorities simultaneously and meet deadlines as required.
## Proficiency in Microsoft Office Suite (Word, Excel, Access, PowerPoint, Plateu, Datamanagement)
## Desired Characteristics
## Strategic workforce planning experience
## Statistic and project management experience.
## http://www.vagas.com.br/vagas/v1195884/latin-america-strategic-workforce-planning-coordinator
## @@@@@@@
Algumas informações úteis que podem ser extraídas:
- Nível hierárquico
- Nome da vaga (analista, estatístico, etc)
- Atividades
- Requisitos
# Extract structured fields from the status posts that mention
# www.vagas.com.br: hierarchy level, location, requirements and education.
# NOTE(review): the `antes` vector literal below looks truncated — its closing
# parenthesis and any further prefix patterns are not visible in this file;
# confirm against the original document.
antes <- c('([^a-zA-Z]|^)formacao\\:?\n?( e pre\\-requisitos\\:)?',
# `re`: any of the `antes` prefixes followed by the rest of that line
re <- sprintf('(%s)([^\n]+)', paste(antes, collapse = '|'))
vagas <- status %>%
mutate(vagas = str_detect(message, 'www\\.vagas\\.com\\.br')) %>%
filter(vagas) %>%
# text after "Nível hierárquico:" up to the end of that line
mutate(nivel = str_match(message, 'Nível hierárquico\\:([^\n]+)\n')[, 2]) %>%
# text after "Local:" up to the end of that line
mutate(local = str_match(message, 'Local\\:([^\n]+)\n')[, 2]) %>%
# case-insensitive: text after the first "req...:" label, up to the next "|"
mutate(req = str_match(message, regex('req[^:]+\\:([^|]+)',
ignore_case = TRUE))[, 2]) %>%
# NOTE(review): column 7 is a capture-group index into `re`, so it depends on
# how many groups the full `antes` vector contributes — verify it whenever
# `antes` changes. This step also calls tjsp::rm_accent rather than the
# rm_accent helper defined earlier in this file; confirm they are equivalent.
mutate(form = str_match(tolower(tjsp::rm_accent(message)), re)[, 7])
Em relação ao nível hierárquico, temos o seguinte resultado:
# Number of job posts per hierarchy level, most frequent first, with each
# level's share of the total in percent.
# NOTE(review): this chain used to end in a dangling `%>%`, leaving the
# expression incomplete; a table-formatting step may have followed — confirm
# against the original document.
vagas %>%
  count(nivel, sort = TRUE) %>%
  mutate(` %` = round(n/sum(n) * 100, 2))
nivel | n | % |
Estágio | 221 | 23.94 |
Pleno | 212 | 22.97 |
Júnior/Trainee | 199 | 21.56 |
Sênior | 189 | 20.48 |
Supervisão/Coordenação | 43 | 4.66 |
Auxiliar/Operacional | 25 | 2.71 |
NA | 17 | 1.84 |
Gerência | 13 | 1.41 |
Técnico | 4 | 0.43 |
Em relação ao local, temos:
# The 10 most frequent job locations, with each location's share of all job
# posts in percent (computed before truncating to 10 rows).
# NOTE(review): this chain used to end in a dangling `%>%`, leaving the
# expression incomplete; a table-formatting step may have followed — confirm
# against the original document.
vagas %>%
  count(local, sort = TRUE) %>%
  mutate(` %` = round(n/sum(n) * 100, 2)) %>%
  head(10)
local | n | % |
São Paulo / SP / BR | 274 | 29.69 |
Rio de Janeiro / RJ / BR | 130 | 14.08 |
Curitiba / PR / BR | 38 | 4.12 |
Rio de Janeiro / RJ / BR - Centro | 30 | 3.25 |
Belo Horizonte / MG / BR | 27 | 2.93 |
NA | 24 | 2.60 |
Barueri / SP / BR | 20 | 2.17 |
Brasil | 19 | 2.06 |
São Paulo / BR | 17 | 1.84 |
Campinas / SP / BR | 16 | 1.73 |
O gráfico abaixo mostra quais os termos mais frequentes quando se fala em formação.
# aux <- vagas %>%
# sample_n(1) %>%
# select(from_name, message, form) %>%
# sapply(function(x) {cat(x); cat('\n\n@@@@@@@\n\n')})
# One indicator column per field of study: the column holds the field's label
# when the extracted education text (`form`) mentions it, and NA otherwise.
# Only posts with a non-missing `form` are kept.
cursos <- vagas %>%
  filter(!is.na(form)) %>%
  mutate(
    matematica = ifelse(str_detect(form, 'matematica'), 'matematica', NA),
    contabeis = ifelse(str_detect(form, 'ciencias cont|contab'),
                       'ciencias contabeis', NA),
    engenharia = ifelse(str_detect(form, 'engenharia'), 'engenharia', NA),
    economia = ifelse(str_detect(form, 'economia'), 'economia', NA),
    financas = ifelse(str_detect(form, 'financas'), 'financas', NA),
    atuaria = ifelse(str_detect(form, 'atuari'), 'atuaria', NA),
    administracao = ifelse(str_detect(form, 'admin'), 'administracao', NA),
    marketing = ifelse(str_detect(form, 'marketing'), 'marketing', NA),
    informatica = ifelse(str_detect(form, 'inform|compu'), 'informatica', NA),
    estatistica = ifelse(str_detect(form, 'estat'), 'estatistica', NA)
  )
# Bar chart: number of job posts mentioning each field of study, bars ordered
# by count.
cursos %>%
  summarise_each(funs(sum(!is.na(.))), matematica:estatistica) %>%
  gather() %>%
  ggplot(aes(x = reorder(key, X = value), y = value, fill = key)) +
  geom_bar(stat = 'identity') +
  theme_bw() +
  # must come after theme_bw(), which would otherwise reset the axis text
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 12)) +
  guides(fill = FALSE) +
  labs(x = 'formacao', y = 'quantidade')
O gráfico abaixo mostra quais os termos mais frequentes quando se fala em formação, apenas quando aparece o nome "estatistica".
# Bar chart restricted to posts that mention statistics: for every other field
# of study, the proportion of those posts that also mention it.
cursos %>%
  filter(!is.na(estatistica)) %>%
  summarise_each(funs(sum(!is.na(.))), matematica:estatistica) %>%
  gather() %>%
  # divide every count by the statistics count, then drop the statistics row
  mutate(value = value / value[key=='estatistica']) %>%
  filter(key != 'estatistica') %>%
  ggplot(aes(x = reorder(key, X = value), y = value, fill = key)) +
  geom_bar(stat = 'identity') +
  scale_y_continuous(labels = scales::percent, breaks = 0:10/10) +
  theme_bw() +
  # must come after theme_bw(), which would otherwise reset the axis text
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 12)) +
  guides(fill = FALSE) +
  labs(x = 'formacao', y = 'proporcao')
- Pegar atividades
- Pegar requisitos
- Pegar desejáveis