synapse/rust/src/segmenter.rs
Andrew Morgan be4c95baf1
Replace PyICU with Rust icu_segmenter crate (#18553)
Co-authored-by: anoa's Codex Agent <codex@amorgan.xyz>
Co-authored-by: Quentin Gliech <quenting@element.io>
2025-07-03 11:12:12 +01:00

34 lines
1.0 KiB
Rust

use icu_segmenter::options::WordBreakInvariantOptions;
use icu_segmenter::WordSegmenter;
use pyo3::prelude::*;
#[pyfunction]
pub fn parse_words(text: &str) -> PyResult<Vec<String>> {
let segmenter = WordSegmenter::new_auto(WordBreakInvariantOptions::default());
let mut parts = Vec::new();
let mut last = 0usize;
// `segment_str` gives us word boundaries as a vector of indexes. Use that
// to build a vector of words, and return.
for boundary in segmenter.segment_str(text) {
if boundary > last {
parts.push(text[last..boundary].to_string());
}
last = boundary;
}
Ok(parts)
}
pub fn register_module(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> {
let child_module = PyModule::new(py, "segmenter")?;
child_module.add_function(wrap_pyfunction!(parse_words, m)?)?;
m.add_submodule(&child_module)?;
py.import("sys")?
.getattr("modules")?
.set_item("synapse.synapse_rust.segmenter", child_module)?;
Ok(())
}