Struct CountVectorizerValidParams

Source

pub struct CountVectorizerValidParams { /* private fields */ }

Expand description

Count vectorizer: learns a vocabulary from a sequence of documents (or file paths) and maps each vocabulary entry to an integer value, producing a CountVectorizer that can be used to count the occurrences of each vocabulary entry in any sequence of documents. Alternatively a user-specified vocabulary can be used for fitting.

§Attributes

If a user-defined vocabulary is used for fitting then the following attributes will not be considered during the fitting phase but they will still be used by the CountVectorizer to transform any text to be examined.

split_regex: the regex expression used to split documents into tokens. Defaults to r"\b\w\w+\b", which selects “words”, using whitespace and punctuation symbols as separators.
convert_to_lowercase: if true, all documents used for fitting will be converted to lowercase. Defaults to true.
n_gram_range: if set to (1,1) single tokens will be candidate vocabulary entries, if (2,2) then adjacent token pairs will be considered, if (1,2) then both single tokens and adjacent token pairs will be considered, and so on. The definition of token depends on the regex used for splitting the documents. The default value is (1,1).
normalize: if true, all characters in the documents used for fitting will be normalized according to Unicode’s NFKD normalization. Defaults to true.
document_frequency: specifies the minimum and maximum (relative) document frequencies that each vocabulary entry must satisfy. Defaults to (0., 1.) (i.e. 0% minimum and 100% maximum).
stopwords: optional list of entries to be excluded from the generated vocabulary. Defaults to None.

Struct CountVectorizerValidParams

§Attributes

Implementations§

impl CountVectorizerValidParams

pub fn tokenizer_function(&self) -> Option<fn(&str) -> Vec<&str>>

pub fn max_features(&self) -> Option<usize>

pub fn convert_to_lowercase(&self) -> bool

pub fn split_regex(&self) -> Ref<'_, Regex>

pub fn n_gram_range(&self) -> (usize, usize)

pub fn normalize(&self) -> bool

pub fn document_frequency(&self) -> (f32, f32)

pub fn stopwords(&self) -> &Option<HashSet<String>>

impl CountVectorizerValidParams

pub fn fit<T: ToString + Clone, D: Data<Elem = T>>( &self, x: &ArrayBase<D, Ix1>, ) -> Result<CountVectorizer>

pub fn fit_files<P: AsRef<Path>>( &self, input: &[P], encoding: EncodingRef, trap: DecoderTrap, ) -> Result<CountVectorizer>

pub fn fit_vocabulary<T: ToString>( &self, words: &[T], ) -> Result<CountVectorizer>

Trait Implementations§

impl Clone for CountVectorizerValidParams

fn clone(&self) -> CountVectorizerValidParams

fn clone_from(&mut self, source: &Self)

impl Debug for CountVectorizerValidParams

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Auto Trait Implementations§

impl !Freeze for CountVectorizerValidParams

impl !RefUnwindSafe for CountVectorizerValidParams

impl Send for CountVectorizerValidParams

impl !Sync for CountVectorizerValidParams

impl Unpin for CountVectorizerValidParams

impl UnwindSafe for CountVectorizerValidParams

Blanket Implementations§

impl<T> Any for T where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

impl<T> Borrow<T> for T where T: ?Sized,

fn borrow(&self) -> &T

impl<T> BorrowMut<T> for T where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

impl<T> CloneToUninit for T where T: Clone,

unsafe fn clone_to_uninit(&self, dest: *mut u8)

impl<T> From<T> for T

fn from(t: T) -> T

impl<T, U> Into<U> for T where U: From<T>,

fn into(self) -> U

impl<T> IntoEither for T

fn into_either(self, into_left: bool) -> Either<Self, Self>

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self> where F: FnOnce(&Self) -> bool,

impl<T> Pointable for T

const ALIGN: usize

type Init = T

unsafe fn init(init: <T as Pointable>::Init) -> usize

unsafe fn deref<'a>(ptr: usize) -> &'a T

unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T

unsafe fn drop(ptr: usize)

impl<SS, SP> SupersetOf<SS> for SP where SS: SubsetOf<SP>,

fn to_subset(&self) -> Option<SS>

fn is_in_subset(&self) -> bool

unsafe fn to_subset_unchecked(&self) -> SS

fn from_subset(element: &SS) -> SP

impl<T> ToOwned for T where T: Clone,

type Owned = T

fn to_owned(&self) -> T

fn clone_into(&self, target: &mut T)

impl<T, U> TryFrom<U> for T where U: Into<T>,

type Error = Infallible

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

impl<T, U> TryInto<U> for T where U: TryFrom<T>,

type Error = <U as TryFrom<T>>::Error

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

impl<V, T> VZip<V> for T where V: MultiLane<T>,

fn vzip(self) -> V

Struct CountVectorizerValidParams

impl<T> Any for T
where T: 'static + ?Sized,

impl<T> Borrow<T> for T
where T: ?Sized,

impl<T> BorrowMut<T> for T
where T: ?Sized,

impl<T> CloneToUninit for T
where T: Clone,

impl<T, U> Into<U> for T
where U: From<T>,

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

impl<SS, SP> SupersetOf<SS> for SP
where SS: SubsetOf<SP>,

impl<T> ToOwned for T
where T: Clone,

impl<T, U> TryFrom<U> for T
where U: Into<T>,

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

impl<V, T> VZip<V> for T
where V: MultiLane<T>,