[R] Stringr 패키지

데이터 분석/텍스트 분석 2022. 4. 16. 00:02

Stringr 패키지 설치 및 실행

string <- c("data analysis is useful",
        "business analytics is helpful",
        'visualization of data is interestiong for data scientidyd')

install.packages('stringr')
library(stringr)

str_detect()

grepl() 함수처럼, 문자열 벡터의 각 원소가 pattern 파라미터와 일치하는 부분을 포함하면 TRUE, 그렇지 않으면 FALSE를 출력하는 함수

str_detect(string = string, pattern = 'data')
# [1]  TRUE FALSE  TRUE

str_detect(string, 'DATA') # 대소문자 구분
# [1] FALSE FALSE FALSE

# 대소문자 구분없이 확인
str_detect(string, fixed("DATA", ignore_case = TRUE))
# [1]  TRUE FALSE  TRUE

# fixed 활용, '.'을 메타문자가 아닌 문자로 취급하여 함수 진행 
str_detect(c('aby','acy','a.y'), 'a.y')
# [1] TRUE TRUE TRUE
str_detect(c('aby','acy','a.y'), fixed('a.y'))
# [1] FALSE FALSE  TRUE

# 이스케이프 문자를 써도 동일한 결과 출력
str_detect(c('aby','acy','a.y'),'a\\.y')
# [1] FALSE FALSE  TRUE

# negate 파라미터에 TRUE를 입력하면 pattern이 포함되지 않은 문자열을 TRUE로 출력(파라미터 default 값 : FALSE)
str_detect(string = string, pattern = 'data', negate = TRUE) 
# [1] FALSE  TRUE FALSE

str_locate()

string 파라미터에 입력받은 문자열 벡터의 각 원소에서 pattern 파라미터와 처음으로 일치하는 부분 1개의 시작 위치와 끝 위치 인덱스를 행렬로 출력하는 함수 (일치하는 부분이 없으면 NA 출력)

str_locate_all()

str_locate()와 달리 각 원소에서 처음 일치하는 부분 1개만이 아니라 pattern 파라미터와 일치하는 모든 부분의 시작 위치와 끝 위치를 출력하며, 결과를 원소별 행렬로 구성된 리스트로 출력하는 함수

str_locate(string, 'data')
'''
     start end
[1,]     1   4
[2,]    NA  NA
[3,]    18  21
'''

str_locate_all(string, 'data')
'''
[[1]]
     start end
[1,]     1   4

[[2]]
    start end

[[3]]
     start end
[1,]    18  21
[2,]    43  46
'''

str_extract(), str_extract_all()

문자열 추출, regmatches()와 같은 기능

str_extract(string, 'data')
# [1] "data" NA     "data"

str_extract_all(string, 'data') # 각 문자열 단위로 리스트 출력 
'''
[[1]]
[1] "data"

[[2]]
character(0)

[[3]]
[1] "data" "data"
'''

str_extract_all(string, 'data', simplify = TRUE) # 행렬 형태로 출력
'''
     [,1]   [,2]  
[1,] "data" ""    
[2,] ""     ""    
[3,] "data" "data"
'''

활용

unlist(str_extract_all(string, 'data'))
# [1] "data" "data" "data"

sentences5 <- sentences[1:5] 
# sentences(stringr 내장 데이터)의 인덱스 1~5까지 데이터를 sentences5에 할당

str_extract(sentences5, '(a|A|the|The) (\\w+)')
# [1] "The birch" "the sheet" "the depth" "a chicken"
# [5] NA         

str_extract_all(sentences5, '(a|A|the|The) (\\w+)')
'''
[[1]]
[1] "The birch"  "the smooth"

[[2]]
[1] "the sheet" "the dark" 

[[3]]
[1] "the depth" "a well"   

[[4]]
[1] "a chicken" "a rare"   

[[5]]
character(0)

'''

str_match(), str_match_all()

pattern과 매칭되는 전체 문자열(1열)과 pattern 안에서 괄호로 묶은 각 캡처 그룹에 매칭된 부분 문자열(2열 이후)까지 행렬 형태로 출력

str_match(sentences5, '(a|A|the|The) (\\w+)')
'''
     [,1]        [,2]  [,3]     
[1,] "The birch" "The" "birch"  
[2,] "the sheet" "the" "sheet"  
[3,] "the depth" "the" "depth"  
[4,] "a chicken" "a"   "chicken"
[5,] NA          NA    NA    
'''

str_match_all(sentences5, '(a|A|the|The) (\\w+)')
'''
[[1]]
     [,1]         [,2]  [,3]    
[1,] "The birch"  "The" "birch" 
[2,] "the smooth" "the" "smooth"

[[2]]
     [,1]        [,2]  [,3]   
[1,] "the sheet" "the" "sheet"
[2,] "the dark"  "the" "dark" 

[[3]]
     [,1]        [,2]  [,3]   
[1,] "the depth" "the" "depth"
[2,] "a well"    "a"   "well" 

[[4]]
     [,1]        [,2] [,3]     
[1,] "a chicken" "a"  "chicken"
[2,] "a rare"    "a"  "rare"   

[[5]]
     [,1] [,2] [,3]
'''

str_replace(), str_replace_all()

문자열 치환

str_replace(string, 'data', 'text') # 각 문자열에 pattern과 같은 부분 1개씩만 치환
'''
[1] "text analysis is useful"                                  
[2] "business analytics is helpful"                            
[3] "visualization of text is interestiong for data scientidyd"
'''

str_replace_all(string, 'data', 'text') # 각 문자열에 pattern과 같은 부분 모두 치환
'''
[1] "text analysis is useful"                                  
[2] "business analytics is helpful"                            
[3] "visualization of text is interestiong for text scientidyd"
'''

str_split()

문자열 분할

str_split(string, ' ')
'''
[[1]]
[1] "data"     "analysis" "is"       "useful"  

[[2]]
[1] "business"  "analytics" "is"        "helpful"  

[[3]]
[1] "visualization" "of"            "data"         
[4] "is"            "interestiong"  "for"          
[7] "data"          "scientidyd"   
'''
str_split(sentences5, ' ')
'''
[[1]]
[1] "The"     "birch"   "canoe"   "slid"    "on"     
[6] "the"     "smooth"  "planks."

[[2]]
[1] "Glue"        "the"         "sheet"      
[4] "to"          "the"         "dark"       
[7] "blue"        "background."

[[3]]
[1] "It's"  "easy"  "to"    "tell"  "the"   "depth"
[7] "of"    "a"     "well."

[[4]]
[1] "These"   "days"    "a"       "chicken" "leg"    
[6] "is"      "a"       "rare"    "dish."  

[[5]]
[1] "Rice"   "is"     "often"  "served" "in"     "round" 
[7] "bowls."
'''

분할 활용

unlist(str_split(sentences5,' '))
'''
 [1] "The"         "birch"       "canoe"      
[4] "slid"        "on"          "the"        
[7] "smooth"      "planks."     "Glue"       
[10] "the"         "sheet"       "to"         
[13] "the"         "dark"        "blue"       
[16] "background." "It's"        "easy"       
[19] "to"          "tell"        "the"        
[22] "depth"       "of"          "a"          
[25] "well."       "These"       "days"       
[28] "a"           "chicken"     "leg"        
[31] "is"          "a"           "rare"       
[34] "dish."       "Rice"        "is"         
[37] "often"       "served"      "in"         
[40] "round"       "bowls." 
'''

unique(unlist(str_split(sentences5, ' ')))
'''
 [1] "The"         "birch"       "canoe"      
[4] "slid"        "on"          "the"        
[7] "smooth"      "planks."     "Glue"       
[10] "sheet"       "to"          "dark"       
[13] "blue"        "background." "It's"       
[16] "easy"        "tell"        "depth"      
[19] "of"          "a"           "well."      
[22] "These"       "days"        "chicken"    
[25] "leg"         "is"          "rare"       
[28] "dish."       "Rice"        "often"      
[31] "served"      "in"          "round"      
[34] "bowls."     
'''

str_split() 파라미터 설명

# n 파라미터: 각 문자열을 최대 n개의 조각으로 분할 (n번째 조각에는 나머지 문자열이 분할되지 않은 채로 포함)
str_split(sentences5, ' ', n=5)

'''

[[1]]
[1] "The" "birch"
[3] "canoe" "slid"
[5] "on the smooth planks."

[[2]]
[1] "Glue"
[2] "the"
[3] "sheet"
[4] "to"
[5] "the dark blue background."

[[3]]
[1] "It's" "easy"
[3] "to" "tell"
[5] "the depth of a well."

[[4]]
[1] "These" "days"
[3] "a" "chicken"
[5] "leg is a rare dish."

[[5]]
[1] "Rice" "is" "often"
[4] "served" "in round bowls."

'''

# simplify 파라미터, 행렬로 출력

str_split(sentences5, ' ', simplify = TRUE)
'''
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] "The" "birch" "canoe" "slid" "on" "the"
[2,] "Glue" "the" "sheet" "to" "the" "dark"
[3,] "It's" "easy" "to" "tell" "the" "depth"
[4,] "These" "days" "a" "chicken" "leg" "is"
[5,] "Rice" "is" "often" "served" "in" "round"
[,7] [,8] [,9]
[1,] "smooth" "planks." ""
[2,] "blue" "background." ""
[3,] "of" "a" "well."
[4,] "a" "rare" "dish."
[5,] "bowls." "" ""
'''

기타 유용한 함수

str_length()

str_length(string) # 각 문자열의 글자 수 출력
# [1] 23 29 57

str_count()

str_count(string, 'data') # 각 문자열의 'data' 개수 출력
# [1] 1 0 2

str_count(string, '\\w+') # 각 문자열의 단어 개수 출력
# [1] 4 4 8

str_pad()

mon <- 1:12
str_pad(mon, width = 3, side = 'left', pad = '0')
# [1] "001" "002" "003" "004" "005" "006" "007" "008" "009"
# [10] "010" "011" "012"

## width 파라미터: 패딩 후 문자열의 최소 길이 지정 (이미 width 이상인 문자열은 그대로 유지)
## side 파라미터: pad를 넣을 방향 지정 (both, right, left)
## pad 파라미터: 빈 자리를 채울 문자 지정 (기본값: 공백 ' ')

str_trim(), 문자열의 공백 제거

string_pad <- str_pad(string, width = max(str_length(string)),
side = 'both', pad = ' ')

string_pad
'''
[1] "                 data analysis is useful                 "
[2] "              business analytics is helpful              "
[3] "visualization of data is interestiong for data scientidyd"
'''

str_trim(string_pad)
'''
[1] "data analysis is useful"
[2] "business analytics is helpful"
[3] "visualization of data is interestiong for data scientidyd"
'''

# side 파라미터: 공백을 제거할 방향 지정,(both, right, left)

'데이터 분석 > 텍스트 분석' 카테고리의 다른 글

[R] KoNLP 패키지 설치 (0)	2022.05.08
[R] 텍스트 패턴 찾기, 위치 정보 수집, 수정 함수 (0)	2022.04.14
[R] Do it R 텍스트마이닝_1 (0)	2022.04.11
[R] sub(), gsub(), 정규 표현식 (0)	2022.04.08
[R]함수 paste(), paste0(), outer() 등 (0)	2022.04.02

ABOUT ME

공부방 공부방

Stringr 패키지 설치 및 실행

str_detect()

str_locate()

str_locate_all()

str_extract(), str_extract_all()

문자열 추출, regmatches()와 같은 기능

활용

str_match(), str_match_all()

str_replace(), str_replace_all()

str_split()

기타 유용한 함수

str_length()

str_count()

str_pad()

str_trim(), 문자열의 공백 제거

'데이터 분석 > 텍스트 분석' 카테고리의 다른 글

티스토리툴바

ABOUT ME

Stringr 패키지 설치 및 실행

str_detect()

str_locate()

str_locate_all()

str_extract(), str_extract_all()

문자열 추출, regmatches()와 같은 기능

활용

str_match(), str_match_all()

str_replace(), str_replace_all()

str_split()

기타 유용한 함수

str_length()

str_count()

str_pad()

str_trim(), 문자열의 공백 제거

'데이터 분석 > 텍스트 분석' 카테고리의 다른 글

관련글 관련글 더보기

티스토리툴바