% measures.tex
% Document class: KOMA-Script article, 11pt base font size.
\documentclass[11pt]{scrartcl}
% Input encoding and fonts.
\usepackage[utf8]{inputenc}
\usepackage{lmodern}
\usepackage[T1]{fontenc}
\usepackage{microtype}
% Citations.
\usepackage{natbib}
\setcitestyle{authoryear,round,semicolon,aysep={,},yysep={,},notesep={:~}}
% Graphics, tables and math.
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{amsmath}
% \xspace is used by the abbreviation macros defined after the package list.
\usepackage{xspace}
% hyperref patches commands of many other packages (natbib among them) and
% is therefore loaded after all of them.
\usepackage{hyperref}
\makeatletter
% Shared ending of all abbreviation macros below.  If the next character in
% the input is already a period, nothing is added and that period serves as
% the final dot of the abbreviation.  Otherwise the dot is typeset, \@ sets
% the space factor so that the dot is not treated as the end of a sentence,
% and \xspace decides whether a space has to follow.
\newcommand*{\abbrev@dot}{\@ifnextchar.{}{.\@\xspace}}
% "id est" (that is)
\newcommand*{\ie}{i.\,e\abbrev@dot}
% "exempli gratia" (for example)
\newcommand*{\eg}{e.\,g\abbrev@dot}
% "et cetera" (and so on)
\newcommand*{\etc}{etc\abbrev@dot}
% "confer" (compare)
\newcommand*{\cf}{cf\abbrev@dot}
% "versus"
\newcommand*{\vs}{vs\abbrev@dot}
\makeatother
% Title metadata: title and author are empty, the date is the day of
% compilation.
\title{}
\author{}
\date{\today}
\begin{document}
\maketitle
% Notation shared by every formula in this document.
\begin{description}
  \item[\(N\):] Text length, \ie number of tokens
  \item[\(V(N)\):] Vocabulary size, \ie number of types
  \item[\(V(i, N)\):] Number of types occurring \(i\) times
\end{description}
% Measures computed from the text length N and the vocabulary size V(N) only.
\section{Measures that use sample size and vocabulary size}
\[\text{type-token ratio} = \frac{V(N)}{N}\]
\[\text{Guiraud's } R = \frac{V(N)}{\sqrt{N}}\]
\[\text{Herdan's } C = \frac{\log(V(N))}{\log(N)}\]
\[\text{Dugast's } k = \frac{\log(V(N))}{\log(\log(N))}\]
% The squared logarithm carries explicit parentheses so that it cannot be
% misread as \log(N^2).  Dugast's U is the reciprocal of Maas' a^2.
\[\text{Maas' } a^2 = \frac{\log(N) - \log(V(N))}{\bigl(\log(N)\bigr)^2}\]
\[\text{Dugast's } U = \frac{\bigl(\log(N)\bigr)^2}{\log(N) - \log(V(N))}\]
\[\text{Tuldava's } \textit{LN} = \frac{1 - V(N)^2}{V(N)^2\log(N)}\]
% NOTE(review): with a = -0.172 the inner exponent -a is positive, so this
% reads N^{V(N)^{0.172}}.  Confirm the intended sign of a against the
% implementation and the literature before relying on this formula.
\[\text{Brunet's } W = N^{V(N)^{-a}} \text{ with } a = -0.172\]
\[\text{Carroll's } \textit{CTTR} = \frac{V(N)}{\sqrt{2 N}}\]
\[\text{Summer's } S = \frac{\log(\log(V(N)))}{\log(\log(N))}\]
% Measures that look at single frequency classes only: V(1, N), the number of
% types occurring once, and V(2, N), the number of types occurring twice.
\section{Measures that use part of the frequency spectrum}
\begin{equation*}
  \text{Honoré's } H = 100 \frac{\log(N)}{1 - \frac{V(1, N)}{V(N)}}
\end{equation*}
\begin{equation*}
  \text{Sichel's } S = \frac{V(2, N)}{V(N)}
\end{equation*}
\begin{equation*}
  \text{Michéa's } M = \frac{V(N)}{V(2, N)}
\end{equation*}
\section{Measures that use the whole frequency spectrum}
% Entropy and Yule's K both sum over the frequency classes i = 1, ..., N and
% weight each class by V(i, N), the number of types occurring i times.
% The fraction bar already delimits the argument of \log, so only one
% auto-sized pair of parentheses is kept around the negated logarithm.
\[\text{Entropy} = \sum_{i=1}^N V(i, N) \left(-\log\frac{i}{N}\right) \frac{i}{N}\]
\[\text{Yule's } K = 10^4 \left(-\frac{1}{N} + \sum_{i=1}^N V(i, N) \left( \frac{i}{N}\right)^2 \right)\]
% Simpson's D and Herdan's V_m.  The summation index i is a frequency
% (V(i, N) counts the types occurring i times), so it runs up to the text
% length N, as in the Entropy and Yule's K sums of this section.  V(N) is not
% an upper bound for i: a single type can occur more often than there are
% types in the text.
\[\text{Simpson's } D = \sum_{i=1}^{N} V(i, N) \frac{i}{N} \frac{i - 1}{N - 1}\]
\[\text{Herdan's } V_m = \sqrt{-\frac{1}{V(N)} + \sum_{i=1}^{N} V(i, N) \left(\frac{i}{N}\right)^2}\]
% HD-D: for every type, the probability of drawing it at least once in a
% random sample of 42 tokens (without replacement), divided by the sample
% size and summed over all types.  A type with frequency i is missed with the
% hypergeometric probability \binom{i}{0}\binom{N - i}{42}/\binom{N}{42}.
% Grouping the types by frequency gives a sum over the frequency classes
% i = 1, ..., N in which each class is weighted by V(i, N).
% The display is split at the second equals sign because it is too long to
% read comfortably on a single line.
\begin{align*}
  \text{McCarthy and Jarvis' } \textit{HD-D}
    &= \sum_{i=1}^{N} V(i, N) \frac{1}{42} \left(1 - \frac{\binom{i}{0} \binom{N - i}{42 - 0}}{\binom{N}{42}}\right) \\
    &= \sum_{i=1}^{N} V(i, N) \frac{1}{42} \left(1 - \frac{\binom{N - i}{42}}{\binom{N}{42}}\right)
\end{align*}
% Reference computation (Python; one summand per type, freq = its frequency):
% sum(((1 - scipy.stats.hypergeom.pmf(0, text_length, freq, sample_size)) / sample_size for word, freq in frequency_spectrum.items()))
\end{document}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: t
%%% End: