Split Indic Words

Python Code

```python
import unicodedata


def split_clusters(txt):
    """ Generate grapheme clusters for the Devanagari text."""
    cluster = u''
    end = False
    for char in txt:
        category = unicodedata.category(char)
        if (category == 'Lo' and end ) or category[0] == 'M':
            cluster = cluster + char
        else:
            if cluster:
                yield cluster
            cluster = char
        end = unicodedata.name(char).endswith(' SIGN VIRAMA')
    if cluster:
        yield cluster
```

Go Code

```go
import (
	"strings"
	"unicode"

	"golang.org/x/text/unicode/runenames"
)

func splitClusters(txt string) (ret []string) {
	cluster := ""
	end := false
	for _, x := range txt {
		if (unicode.In(x, unicode.Lo) && end) || unicode.In(x, unicode.M, unicode.Mc, unicode.Me, unicode.Mn) {
			cluster += string(x)
		} else {
			if len(cluster) > 0 {
				if strings.TrimSpace(cluster) != "" {
					ret = append(ret, cluster)
				}
			}
			cluster = string(x)
		}
		end = strings.HasSuffix(runenames.Name(x), " SIGN VIRAMA")
	}
	if len(cluster) > 0 {
		if strings.TrimSpace(cluster) != "" {
			ret = append(ret, cluster)
		}
	}
	return
}
```

जानेवारी 9, 2023 · 1 min · 150 words · शंतनू