Python Code
import unicodedata
def split_clusters(txt):
    """Generate grapheme clusters for the Devanagari text.

    Walks ``txt`` one code point at a time. A code point is appended to the
    current cluster when its general category is a mark (``M*``), or when it
    is an ``Lo`` letter that directly follows a code point whose Unicode name
    ends in " SIGN VIRAMA". Any other code point closes the current cluster
    and starts a new one.

    Args:
        txt: The text to split.

    Yields:
        Each non-empty cluster, in input order. Every code point of ``txt``
        ends up in exactly one yielded cluster (whitespace and control
        characters included).
    """
    cluster = u''
    end = False
    for char in txt:
        category = unicodedata.category(char)
        if (category == 'Lo' and end) or category[0] == 'M':
            # Combining mark, or a letter conjoined by the preceding virama.
            cluster = cluster + char
        else:
            if cluster:
                yield cluster
            cluster = char
        # Unnamed code points (control characters such as newline or tab)
        # make unicodedata.name() raise ValueError unless a default is
        # supplied; an empty default simply means "not a virama".
        end = unicodedata.name(char, '').endswith(' SIGN VIRAMA')
    if cluster:
        yield cluster
Go Code
import (
"strings"
"unicode"
"golang.org/x/text/unicode/runenames"
)
// splitClusters breaks txt into clusters of runes and returns them in input
// order.
//
// A rune is attached to the cluster being built when it is a combining mark
// (Unicode category M), or when it is an "other letter" (category Lo) that
// directly follows a rune whose name, as reported by runenames.Name, ends in
// " SIGN VIRAMA". Any other rune closes the pending cluster and opens a new
// one. Clusters that are empty or consist solely of whitespace are not
// included in the result.
func splitClusters(txt string) (ret []string) {
	var (
		pending     string
		afterVirama bool
	)

	// flush records the pending cluster unless it is empty or all whitespace.
	flush := func() {
		if strings.TrimSpace(pending) != "" {
			ret = append(ret, pending)
		}
	}

	for _, r := range txt {
		joins := (afterVirama && unicode.Is(unicode.Lo, r)) || unicode.IsMark(r)
		if joins {
			pending += string(r)
		} else {
			flush()
			pending = string(r)
		}
		// Remember whether this rune is a virama so the next letter can be
		// conjoined to the current cluster.
		afterVirama = strings.HasSuffix(runenames.Name(r), " SIGN VIRAMA")
	}
	flush()

	return ret
}