Context Navigation

Back to Zeichen

Zeichen: neo_zeichen.php

File neo_zeichen.php, 9.6 KB (added by stephan, 17 years ago)
Das Skript, um die Zeichen-Liste zu erstellen, Benutzung auf eigene Gefahr.

Line
1	<?php
2	# FIXME Compose auslesen
3
4	# Funktionen
5	# -------------------------------
6
# Decodes a UTF-8 byte string into its Unicode code points.
#
# Unicode code point function from http://iki.fi/hsivonen/php-utf8/ (GPL).
#
# Parameters:
#   $str - the UTF-8 encoded byte string; taken by reference but only read.
# Returns:
#   An array of integer code points in input order (a U+FEFF byte order mark
#   is dropped), or false if $str is not well-formed UTF-8: an illegal first
#   octet, a missing continuation octet, a non-shortest form, a 5- or 6-octet
#   sequence, a surrogate, a value above U+10FFFF, or a sequence that is cut
#   off by the end of the string.
function utf8ToUnicode(&$str)
{
    $mState = 0; // number of continuation octets still expected before the
                 // next UTF-8 character sequence starts
    $mUcs4  = 0; // code point assembled so far
    $mBytes = 1; // total number of octets in the current sequence

    $out = array();

    $len = strlen($str);
    for ($i = 0; $i < $len; $i++) {
        // Square brackets: the {} string offset syntax no longer exists in PHP 8.
        $in = ord($str[$i]);
        if (0 == $mState) {
            // When mState is zero we expect either a US-ASCII character or
            // the first octet of a multi-octet sequence.
            if (0 == (0x80 & ($in))) {
                // US-ASCII, pass straight through.
                $out[] = $in;
                $mBytes = 1;
            } else if (0xC0 == (0xE0 & ($in))) {
                // First octet of 2 octet sequence
                $mUcs4 = ($in & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;
            } else if (0xE0 == (0xF0 & ($in))) {
                // First octet of 3 octet sequence
                $mUcs4 = ($in & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;
            } else if (0xF0 == (0xF8 & ($in))) {
                // First octet of 4 octet sequence
                $mUcs4 = ($in & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;
            } else if (0xF8 == (0xFC & ($in))) {
                /* First octet of 5 octet sequence.
                 *
                 * This is illegal because the encoded codepoint must be either
                 * (a) not the shortest form or
                 * (b) outside the Unicode range of 0-0x10FFFF.
                 * Rather than trying to resynchronize, we carry on until the
                 * end of the sequence and let the later error handling code
                 * catch it.
                 */
                $mUcs4 = ($in & 0x03) << 24;
                $mState = 4;
                $mBytes = 5;
            } else if (0xFC == (0xFE & ($in))) {
                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                $mUcs4 = ($in & 1) << 30;
                $mState = 5;
                $mBytes = 6;
            } else {
                /* Current octet is neither in the US-ASCII range nor a legal
                 * first octet of a multi-octet sequence.
                 */
                return false;
            }
        } else {
            // When mState is non-zero, we expect a continuation of the
            // multi-octet sequence.
            if (0x80 == (0xC0 & ($in))) {
                // Legal continuation: merge its six payload bits.
                $shift = ($mState - 1) * 6;
                $mUcs4 |= ($in & 0x0000003F) << $shift;
                if (0 == --$mState) {
                    /* End of the multi-octet sequence. mUcs4 now contains the
                     * final Unicode codepoint to be output.
                     *
                     * Check for illegal sequences and codepoints.
                     */
                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                        (4 < $mBytes) ||
                        // From Unicode 3.2, surrogate characters are illegal
                        (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                        // Codepoints outside the Unicode range are illegal
                        ($mUcs4 > 0x10FFFF)) {
                        return false;
                    }
                    if (0xFEFF != $mUcs4) {
                        // BOM is legal but we don't want to output it
                        $out[] = $mUcs4;
                    }
                    // Reset the decoder state for the next character.
                    $mState = 0;
                    $mUcs4 = 0;
                    $mBytes = 1;
                }
            } else {
                /* ((0xC0 & (*in) != 0x80) && (mState != 0))
                 *
                 * Incomplete multi-octet sequence.
                 */
                return false;
            }
        } # if state ...
    } # for-loop

    if (0 != $mState) {
        // The string ended while continuation octets were still expected.
        return false;
    }
    return $out;
}
117
# Reads all keys of the main keyboard, together with their characters, from
# the Neo reference.
#
# Parameters:
#   $reference - optional contents of the reference file. When omitted (null)
#                the file is downloaded from neo-layout.org.
# Returns:
#   An array indexed by '<position>_<level>' (position counted from 1 across
#   all rows, level 1 to 6) whose values are the characters found in the
#   drawing. An empty array is returned when the reference cannot be loaded
#   or does not contain the main keyboard drawing.
function getNeoKeys ($reference = null) {
    $key = array();

    # Load the reference unless the caller handed it in.
    if ($reference === null) {
        $reference = file_get_contents('http://neo-layout.org/svn/A-REFERENZ-A/neo20.txt');
    }
    if (!is_string($reference)) {
        return $key;
    }

    # Find the main keyboard: everything between its top and bottom border.
    if (!preg_match('/┌─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────────┐\n(.*)\n└──────┴──────┴──────┴──────────────────────────────────────┴──────┴──────┴──────┴──────┘/s', $reference, $found)) {
        return $key;
    }
    # Split the drawing into keyboard rows at the separator lines.
    $rows = preg_split('/\n├.*\n/', $found[1]);

    $n = 1;
    # For every row:
    foreach ($rows as $row) {
        $sub_rows = preg_split('/\n/', $row);

        # Find the characters in both text lines of the row.
        # U+030F is a combining character and appears together with a space
        # so that it gets displayed.
        preg_match_all('/│(.) (\x{030F} |.) (.)(?=│)/u', $sub_rows[0], $r1);
        preg_match_all('/│(.) (.) (.)(?=│)/u', $sub_rows[1], $r2);

        # For every key:
        $keyCount = count($r1[0]);
        for ($i = 0; $i < $keyCount; $i++) {
            # Skip the Enter key (marked with U+21B2), which does not belong
            # to this row; decrementing $n keeps the numbering gap-free.
            if ($r1[2][$i] == "\xE2\x86\xB2" && $i == 11) {
                $n--;
                continue;
            }
            # Store the characters of both lines, ordered by their level:
            # odd levels come from the lower line, even levels from the upper.
            $position = $n + $i;
            $key[$position . '_1'] = $r2[1][$i];
            $key[$position . '_2'] = $r1[1][$i];
            $key[$position . '_3'] = $r2[2][$i];
            $key[$position . '_4'] = $r1[2][$i];
            $key[$position . '_5'] = $r2[3][$i];
            $key[$position . '_6'] = $r1[3][$i];
        }
        $n += $keyCount;
    }
    return $key;
}
157
# Uses utf8ToUnicode() to find the code point of the first character of
# $char and returns it as upper-case hexadecimal, left-padded with zeros to
# at least four digits (e.g. "00E4").
#
# Returns false when no code point is available, i.e. utf8ToUnicode()
# rejected the input or produced an empty list. U+0000 is a regular code
# point and yields "0000".
function getUnicodeCodepoint ($char) {
    $codepoints = utf8ToUnicode($char);
    # Check the result explicitly: indexing into false or an empty array is an
    # error, and a plain truthiness test would mistake code point 0 for one.
    if (!is_array($codepoints) || !isset($codepoints[0])) return false;
    return strtoupper(str_pad(dechex($codepoints[0]), 4, "0", STR_PAD_LEFT));
}
164
165
166
# Fetch the Unicode data used to name the characters.
# ----------------------------------------------
ini_set('memory_limit', '128M');
# Very memory hungry; keeping UnicodeData in a database would really be better.
$ucdfull = file_get_contents('http://www.unicode.org/Public/UNIDATA/UnicodeData.txt');
if ($ucdfull === false) {
    # Without the character names no meaningful table can be produced.
    exit("Could not load UnicodeData.txt\n");
}
$ucdfull = explode("\n", $ucdfull);
# Build an array indexed by the first field (the code point) for easy lookups.
$ucd = array();
foreach ($ucdfull as $chardata) {
    $chardata = explode(';', $chardata);
    $ucd[$chardata[0]] = $chardata;
}
# The raw lines are no longer needed.
unset($ucdfull);

# Find the characters that can be produced
# ---------------------------------------------

# Load all keys with their characters from the reference.
$keys = getNeoKeys();

# These characters have to be appended because the space bar is not parsed.
# Their position is irrelevant, see further below. The two special blanks are
# written as byte escapes so that editors cannot turn them into plain spaces.
$keys[] = ' ';            # SPACE
$keys[] = "\xC2\xA0";     # NO-BREAK SPACE (U+00A0)
$keys[] = "\xE2\x80\xAF"; # NARROW NO-BREAK SPACE (U+202F)

# Remove duplicates and sort everything.
$chars = array_unique($keys);
sort($chars);

# Modifier prefix needed for each level.
$modifiers = array(
    '1' => '',
    '2' => '<Shift> + ',
    '3' => '<Mod3> + ',
    '4' => '<Mod4> + ',
    '5' => '<Shift> + <Mod3> + ',
    '6' => '<Mod3> + <Mod4> + ',
);

# For every character, collect the combination(s) that produce it
# ------------------------------------------------

foreach ($chars as $char) {
    $char = array('char' => $char);

    # Exceptions whose combinations are wrong or of no interest:
    # These characters are not produced; the reference only shows them for illustration.
    if (in_array($char['char'], array('⇞','⇟','⇠','⇡','⇢','⇣','⇱','⇲','⇥','⌦','⌧','⌫','↲','↶','⎀'), true)) continue; # Control-Keys
    # The dead keys are not produced normally either. Their characters could be obtained by reading the Compose file.
    if (in_array($char['char'], array('˜','ˇ','¯','ˆ','˚','˘',"\xCC\x8F\x20",'`','¨','῾','¸','˝','˙','´','᾿'), true)) continue; # Dead-Keys

    # Decode once and reuse the result for both the label and the name lookup.
    $hex = getUnicodeCodepoint($char['char']);
    $char['codepoint'] = 'U+' . $hex;
    # Read the character name from UnicodeData (field 1); empty if unknown.
    $char['name'] = ($hex !== false && isset($ucd[$hex][1])) ? $ucd[$hex][1] : '';

    # $keys is indexed by position, so searching it for the character yields
    # every position at which the character occurs.
    $char_positions = array_keys($keys, $char['char'], true);

    $char['combos'] = array();
    foreach ($char_positions as $pos) {
        $pos = (string) $pos;
        # The manually appended blanks carry plain integer indexes instead of
        # '<position>_<level>'; they have no key combination of their own.
        $separator = strpos($pos, '_');
        if ($separator === false) continue;

        # Read the key position from the array index.
        $keypos = substr($pos, 0, $separator);
        # Do not search dead keys, otherwise e.g. "<Mod3> + <´>" would be found for SOLIDUS.
        if (in_array($keypos, array('1', '13', '25'), true)) continue; # 1 = T1; 13 = T2; 25 = T3

        # Different modifiers are needed depending on the level.
        $level = substr($pos, -1);
        if (!isset($modifiers[$level])) continue;

        # Keys are named after their level 1 character.
        $char['combos'][] = $modifiers[$level] . '<{{{' . $keys[$keypos . '_1'] . '}}}>';
    }
    # The space bar has no position, so its combinations are named manually.
    # Unfortunately the space occurs several times in the reference (at spots that are still empty):
    if ($char['name'] === 'SPACE') $char['combos'] = array('<Leertaste>');
    if ($char['name'] === 'DIGIT ZERO') $char['combos'][] = '<Mod4> + <Leertaste>';
    if ($char['name'] === 'NO-BREAK SPACE') $char['combos'] = array('<Shift> + <Mod3> + <Leertaste>');
    if ($char['name'] === 'NARROW NO-BREAK SPACE') $char['combos'] = array('<Mod3> + <Mod4> + <Leertaste>');

    # FIXME Sort the combos, simplest first.

    # Assemble the variables and print them as a row of a wiki-formatted table.
    # ------------------------------------------------
    echo '||{{{' . $char['char'] . '}}}||' . $char['codepoint'] . '||' . $char['name'] . '||' . implode('[[BR]]', $char['combos']) . '||' . "\n";
}
264
265	?>

Download in other formats:

Original Format