R regex for extracting a pattern from messy strings


I have a messy set of strings, as follows.

# Sample input: a mix of strings that do and do not contain a "grp" token
# followed by digits (optionally separated by a space or "-").
string <- c(
  "grp-14994/", "grp-7056 grp-7036/", "grp-24263(24263)/irgc 28588",
  "grp-15916 /irgc-42176",
  "grp-614-250b/", "( grp 11432)/irgc-14570", "tourn", "grpp256",
  "purse", "grp-14956 origin:", "grp 10537", "grp-10096 origin: ",
  "sgrp123", "grp1234", "ac-30009 (grphana)/",
  "ac-3060 grp 536-143/old ac", "rgrpfaa/23", "/-",
  "mgr:7251/", "1216-gr-567/", "x:1 kgrph", "wabgrpvea(ii)",
  "hr33(bgrp)", "tensor",
  "wald", "grp12312"
)

I am trying to extract the instances of "grp" followed by digits, where the two may be separated by a space or "-".

My current attempt gives me the following result.

# Current attempt: capture "grp" (group 3) and the run of digits after it
# (group 5), then replace the whole string with those two groups glued
# together. Elements in which the pattern does not match come back unchanged,
# which is why non-matching inputs still appear in the result below.
gsub(
  "(.*)(\\b)(grp)(-|\\s|)(\\d+)(\\/|\\b)(.*)",
  "\\3\\5",
  string,
  ignore.case = TRUE # must be a logical; a bare `t` is base R's transpose function
)
#>  [1] "grp14994"            "grp7056"             "grp24263"            "grp15916"
#>  [5] "grp614"              "grp11432"            "tourn"               "grpp256"
#>  [9] "purse"               "grp14956"            "grp10537"            "grp10096"
#> [13] "sgrp123"             "grp1234"             "ac-30009 (grphana)/" "grp536"
#> [17] "rgrpfaa/23"          "/-"                  "mgr:7251/"           "1216-gr-567/"
#> [21] "x:1 kgrph"           "wabgrpvea(ii)"       "hr33(bgrp)"          "tensor"
#> [25] "wald"                "grp12312"

But the desired output is:

# Desired result: one element per input string. Each "grp" + digits token is
# kept with its separators removed (several tokens in one string are joined by
# a single space); strings without such a token become "".
out <- c(
  "grp14994", "grp7056 grp7036", "grp24263", "grp15916", "grp614250",
  "grp11432", "", "", "", "grp14956", "grp10537", "grp10096", "",
  "grp1234", "", "grp536143", "", "", "", "", "", "", "", "", "",
  "grp12312"
)
out
#>  [1] "grp14994"        "grp7056 grp7036" "grp24263"        "grp15916"
#>  [5] "grp614250"       "grp11432"        ""                ""
#>  [9] ""                "grp14956"        "grp10537"        "grp10096"
#> [13] ""                "grp1234"         ""                "grp536143"
#> [17] ""                ""                ""                ""
#> [21] ""                ""                ""                ""
#> [25] ""                "grp12312"

How can I modify the regex to get the desired result?

# Extract every "grp" token that is followed by digits and return it with the
# separators stripped. Uses base R only.
#
# x: character vector to search.
# Returns a character vector with one element per element of `x`: the cleaned
# tokens joined by a single space, or "" when `x[i]` contains no token.
extract_grp <- function(x) {
  # \b            "grp" must start a word, so "sgrp123" and "(bgrp)" are skipped
  # [-\s]?        at most one "-" or whitespace between "grp" and the digits
  # \d+(?:-\d+)*  the digits, plus any further "-digits" groups ("614-250")
  pattern <- "\\bgrp[-\\s]?\\d+(?:-\\d+)*"

  hits <- regmatches(
    x,
    gregexpr(pattern, x, perl = TRUE, ignore.case = TRUE)
  )

  vapply(
    hits,
    function(tokens) {
      # Strip separators inside each token *before* joining, so the space that
      # separates two tokens from the same string is not removed as well.
      paste(gsub("[-\\s]", "", tokens, perl = TRUE), collapse = " ")
    },
    character(1)
  )
}

# Example, applied to the `string` vector from the question:
# extract_grp(string)
#>  [1] "grp14994"        "grp7056 grp7036" "grp24263"        "grp15916"
#>  [5] "grp614250"       "grp11432"        ""                ""
#>  [9] ""                "grp14956"        "grp10537"        "grp10096"
#> [13] ""                "grp1234"         ""                "grp536143"
#> [17] ""                ""                ""                ""
#> [21] ""                ""                ""                ""
#> [25] ""                "grp12312"

Comments