1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
|
<?php /** * Experimental PO/POT file word counter. * Word counts are approximate, including numbers and sprintf tokens. * Currently only used for source words in latin script, presumed to be in English. */ class Loco_gettext_WordCount implements Countable {
/** * @var LocoPoIterator */ private $po;
/** * Source Words: Cached count of "msgid" fields, presumed en_US * @var int */ private $sw;
/** * Create counter for a pre-parsed PO/POT file. * @param Loco_gettext_Data */ public function __construct( Loco_gettext_Data $po ){ $this->po = $po; }
/** * @internal */ private function countField( $f ){ $n = 0; foreach( $this->po as $r ){ $n += self::simpleCount( $r[$f] ); } return $n; }
/** * Default count function returns source words (msgid) in current file. * @return int */ #[ReturnTypeWillChange] public function count(){ $n = $this->sw; if( is_null($n) ){ $n = $this->countField('source'); $this->sw = $n; } return $n; }
/** * Very simple word count, only suitable for latin characters, and biased toward English. * @param string * @return int */ public static function simpleCount( $str ){ $n = 0; if( is_string($str) && '' !== $str ){
// TODO should we strip PHP string formatting? // e.g. "Hello %s" currently counts as 2 words. // $str = preg_replace('/%(?:\\d+\\$)?(?:\'.|[-+0 ])*\\d*(?:\\.\\d+)?[suxXbcdeEfFgGo%]/', '', $str );
// Strip HTML (but only if open and close tags detected, else "< foo" would be stripped to nothing if( false !== strpos($str,'<') && false !== strpos($str,'>') ){ $str = strip_tags($str); }
// always html-decode, else escaped punctuation will be counted as words $str = html_entity_decode( $str, ENT_QUOTES, 'UTF-8');
// Collapsing apostrophe'd words into single units: // Simplest way to handle ambiguity of "It's Tim's" (technically three words in English) $str = preg_replace('/(\\w+)\'(\\w)(\\W|$)/u', '\\1\\2\\3', $str ); // Combining floating numbers into single units // e.g. "£1.50" and "€1,50" should be one word each $str = preg_replace('/\\d[\\d,\\.]+/', '0', $str );
// count words by standard Unicode word boundaries $words = preg_split( '/\\W+/u', $str, -1, PREG_SPLIT_NO_EMPTY ); $n += count($words);
/*/ TODO should we exclude some words (like numbers)? foreach( $words as $word ){ if( ! ctype_digit($word) ){ $n++; } }*/ } return $n; }
}
|