Nástroje používateľa

Nástroje správy stránok


blog:odborny:2019-06-28-unicode_normalization_tool

Rozdiely

Tu môžete vidieť rozdiely medzi vybranou verziou a aktuálnou verziou danej stránky.

Odkaz na tento prehľad zmien

Obojstranná predošlá revízia
Predchádzajúca revízia
Nasledujúca revízia
Predchádzajúca revízia
blog:odborny:2019-06-28-unicode_normalization_tool [2019/11/10 03:39] – remove logging Róbert Toth blog:odborny:2019-06-28-unicode_normalization_tool [2025/01/20 20:02] (aktuálne) – [Source Text] Róbert Toth
Riadok 1: Riadok 1:
-====== Unicode Normalization Tool ======+====== Unicode Normalization & Analysis Tool ======
  
 Tool to detect and convert between different [[https://unicode.org/reports/tr15/|Unicode Normalization Forms]]. Tool to detect and convert between different [[https://unicode.org/reports/tr15/|Unicode Normalization Forms]].
  
  
-===== Source string ===== +===== Source Text =====
- +
-<php>?>+
  
 +<HTML>
 <script type="text/javascript"> <script type="text/javascript">
-(function(){+ 
 +function sleep(ms) { 
 +  return new Promise(resolve => setTimeout(resolve, ms)); 
 +
 +window.onload = function() { 
 +  sleep(1000).then(() => { 
 +    document.getElementById('utfSource').focus(); 
 +  }); 
 +  sleep(2000).then(() => { 
 +    document.getElementById('utfSource').focus(); 
 +  }); 
 +  sleep(3000).then(() => { 
 +    document.getElementById('utfSource').focus(); 
 +  }); 
   "use strict";   "use strict";
  
Riadok 30: Riadok 43:
     String.prototype.escapeHtml = escapeHtml;     String.prototype.escapeHtml = escapeHtml;
   }   }
-})();+};
  
 function utfRecalculate() { function utfRecalculate() {
Riadok 46: Riadok 59:
   var sourceForm = '<b>Source text encoding:</b> ';   var sourceForm = '<b>Source text encoding:</b> ';
   if (allForms == 0) {   if (allForms == 0) {
-    sourceForm += 'source text was not in Unicode.';+    sourceForm += 'not in single Unicode normalisation form (or not encoded in Unicode at all).';
   }   }
   else if (isNFD && isNFKD && allForms == 2) {   else if (isNFD && isNFKD && allForms == 2) {
Riadok 68: Riadok 81:
   var resultText = sourceText.normalize(requiredForm);   var resultText = sourceText.normalize(requiredForm);
   resultEl.value = resultText;   resultEl.value = resultText;
 +  
 +  // continue by doing UTF analysis
 +  utfAnalysis();
 } }
  
Riadok 82: Riadok 98:
 function utfAnalysis() { function utfAnalysis() {
   var unicode = unicode12_1_0;   var unicode = unicode12_1_0;
-  var sourceText = document.getElementById('utfResult').value;+  var sourceSelect = document.querySelector('input[name="utfAnalysisSource"]:checked').value; 
 +  var sourceText = (sourceSelect === 'original' 
 +                    ? document.getElementById('utfSource').value 
 +                    : document.getElementById('utfResult').value);
   var resultTextEl = document.getElementById('utfAnalysisText');   var resultTextEl = document.getElementById('utfAnalysisText');
   var resultCharEl = document.getElementById('utfAnalysisCharacters');   var resultCharEl = document.getElementById('utfAnalysisCharacters');
Riadok 89: Riadok 108:
   var analysisChars = '';   var analysisChars = '';
   var spottedChars = {};   var spottedChars = {};
 +  
 +  // count chars and words
 +  analysisText += '<div>Characters: ' + sourceText.length + '</div>';
 +  analysisText += '<div>Words: ' + (sourceText.match(/[^\s]+/g) || new Array()).length + '</div>';
 +  
 +  analysisText += '<div id="utfAnalysisText_content">';
      
   let iterator = sourceText[Symbol.iterator]();   let iterator = sourceText[Symbol.iterator]();
Riadok 96: Riadok 121:
     var codeHex = code.toString(16).padStart(4, '0').toUpperCase();     var codeHex = code.toString(16).padStart(4, '0').toUpperCase();
     var name = (unicode[codeHex] || '[unknown]').toLowerCase();     var name = (unicode[codeHex] || '[unknown]').toLowerCase();
-    analysisText += '<span data-title="'+index+': '+name+' (code: #'+code+'0x'+codeHex+')'+'">'+char.value+'</span>';+    analysisText += '<span data-title="#'+index+': '+name+'&#xa;  code '+code+' 0x'+codeHex+'">'+char.value+'</span>';
          
     if (code in spottedChars) {     if (code in spottedChars) {
Riadok 106: Riadok 131:
     index++;     index++;
   }   }
 +  
 +  analysisText += '</div>';
      
   // display text rundown   // display text rundown
Riadok 111: Riadok 138:
      
   // create character analysis   // create character analysis
-  analysisChars += '<thead><tr><th>Char</th><th>HTML code</th><th>Name</th><th>Count</th><th>Positions</th></tr></thead><tbody>';+  analysisChars += '<thead><tr><th>Char</th><th>Hex code</th><th>HTML code</th><th style="width:40%;">Name</th><th>Count</th><th>Positions</th></tr></thead><tbody>';
   for (var code in spottedChars) {   for (var code in spottedChars) {
     var codeHex = parseInt(code, 10).toString(16).padStart(4, '0').toUpperCase();     var codeHex = parseInt(code, 10).toString(16).padStart(4, '0').toUpperCase();
Riadok 117: Riadok 144:
     var char = String.fromCodePoint(code);     var char = String.fromCodePoint(code);
     var name = (unicode[codeHex] || '[unknown]');     var name = (unicode[codeHex] || '[unknown]');
-    analysisChars += '<tr><td>'+char+'</td><td>&amp;#'+code+';</td><td>'+name.escapeHtml()+'</td><td>'+positions.length+'</td><td>'+positions.join(" ")+'</td></tr>';+    analysisChars += '<tr><td>'+char+'</td><td>0x'+codeHex+'</td><td>&amp;#'+code+';</td><td>'+name.escapeHtml()+'</td><td>'+positions.length+'</td><td><div class="limitHeight">'+positions.join(" ")+'</div></td></tr>';
   }   }
      
Riadok 126: Riadok 153:
 </script> </script>
  
-<textarea id='utfSource' oninput='utfRecalculate();utfAnalysis();' style='width:100%; font-family:Consolas, "Andale Mono WT", "Andale Mono", "Bitstream Vera Sans Mono", "Nimbus Mono L", Monaco, "Courier New", monospace; font-size:11px;' rows='25' cols='120'></textarea> 
- 
-<div id='utfSourceForm'><b>Source text encoding:</b> undetermined</div></br> 
- 
-<dl> 
-  <dt><input type='radio' name='utfForm' onchange='utfRecalculate()' value='NFD'> NFD</dt> 
-  <dd>Canonical Decomposition</dd> 
-  <dt><input type='radio' name='utfForm' onchange='utfRecalculate()' value='NFC' checked='checked'> NFC</dt> 
-  <dd>Canonical Decomposition, followed by Canonical Composition</br></dd> 
-  <dt><input type='radio' name='utfForm' onchange='utfRecalculate()' value='NFKD'> NFKD</dt> 
-  <dd>Compatibility Decomposition</dd> 
-  <dt><input type='radio' name='utfForm' onchange='utfRecalculate()' value='NFKC'> NFKC</dt> 
-  <dd>Compatibility Decomposition, followed by Canonical Composition</dd> 
-</dl> 
- 
-<?php</php> 
- 
- 
-===== Result ===== 
- 
-<php>?> 
- 
-<textarea id='utfResult' style='width:100%; font-family:Consolas, "Andale Mono WT", "Andale Mono", "Bitstream Vera Sans Mono", "Nimbus Mono L", Monaco, "Courier New", monospace; font-size:11px;' rows='25' cols='120'></textarea> 
- 
-<?php</php> 
- 
-==== Text rundown ==== 
- 
-<php>?> 
  
 <style type="text/css"> <style type="text/css">
-  #utfAnalysisText > span {+  #utfAnalysisText 
 +    clear: both; 
 +  } 
 +  #utfAnalysisText_content { 
 +    margin: 1.5em 0.5em; 
 +    float: left; 
 +  } 
 +  #utfAnalysisText_content > span {
     position: relative;     position: relative;
     display: inline-block;     display: inline-block;
Riadok 168: Riadok 173:
     padding-bottom:4px;     padding-bottom:4px;
     vertical-align: bottom;     vertical-align: bottom;
 +    float: left;
     /*cursor: nw-resize; /* nw-resize or text or crosshair */     /*cursor: nw-resize; /* nw-resize or text or crosshair */
   }   }
-  #utfAnalysisText > span:first-of-type {+  #utfAnalysisText_content > span:first-of-type {
     box-shadow: -2px -2px 2px rgb(127, 43, 1);     box-shadow: -2px -2px 2px rgb(127, 43, 1);
   }   }
-  #utfAnalysisText > span:last-of-type {+  #utfAnalysisText_content > span:last-of-type {
     box-shadow: 2px 2px 2px rgb(127, 43, 1);     box-shadow: 2px 2px 2px rgb(127, 43, 1);
   }   }
-  #utfAnalysisText > span:hover {+  #utfAnalysisText_content > span[data-title$="0x000A"] { 
 +    border-right: 2px solid rgb(127, 43, 1); 
 +    border-top-right-radius: 50%; 
 +  } 
 +  #utfAnalysisText_content > span[data-title$="0x000A"] + span { 
 +    clear: left; 
 +  } 
 +  #utfAnalysisText_content > span:hover {
     background-color: rgba(255, 0, 0, 0.5);     background-color: rgba(255, 0, 0, 0.5);
     padding-bottom:0px;     padding-bottom:0px;
Riadok 182: Riadok 195:
     overflow-x: visible;     overflow-x: visible;
   }   }
-  #utfAnalysisText > span:hover::after {+  #utfAnalysisText_content > span:hover::after {
     content: attr(data-title);     content: attr(data-title);
     position: absolute;     position: absolute;
Riadok 188: Riadok 201:
     top: 30px;     top: 30px;
     z-index: 99;     z-index: 99;
-    padding: 5px;+    padding: 5px 8px 5px 5px;
     min-width:150px;     min-width:150px;
 +    width: max-content;
 +    max-width: 250px;
     background-color: rgba(0, 0, 0, 0.85);     background-color: rgba(0, 0, 0, 0.85);
     color: rgb(206, 199, 140);     color: rgb(206, 199, 140);
Riadok 195: Riadok 210:
     text-align: left;     text-align: left;
     text-transform: capitalize;     text-transform: capitalize;
 +  }
 +  
 +  div.limitHeight {
 +    overflow: auto;
 +  }
 +  div.limitHeight:hover {
 +    max-height: fit-content !important;
 +  }
 +  #utfAnalysisCharacters div.limitHeight {
 +    max-height: 3.5em;
   }   }
 </style> </style>
  
-<div id='utfAnalysisText' style='width:100%; font-family:Consolas, "Andale Mono WT", "Andale Mono", "Bitstream Vera Sans Mono", "Nimbus Mono L", Monaco, "Courier New", monospace; font-size:11px;'></div>+<textarea id='utfSource' oninput='utfRecalculate()' style='width:100%; font-family:Consolas, "Andale Mono WT", "Andale Mono", "Bitstream Vera Sans Mono", "Nimbus Mono L", Monaco, "Courier New", monospace; font-size:11px;' rows='15' cols='120' autofocus></textarea>
  
-<?php</php>+<div id='utfSourceForm'><b>Source text encoding:</b> undetermined</div> 
 + 
 +</HTML> 
 + 
 + 
 +===== Normalized Text ===== 
 + 
 +<HTML> 
 + 
 +<div style="float:left; width:49%;"> 
 +  <dl> 
 +    <dt><input type="radio" name="utfForm" onchange="utfRecalculate()" value="NFD"> NFD</dt> 
 +    <dd>Canonical Decomposition</dd> 
 +    <dt><input type="radio" name="utfForm" onchange="utfRecalculate()" value="NFC" checked="checked"> NFC</dt> 
 +    <dd>Canonical Decomposition, followed by Canonical Composition</dd> 
 +  </dl> 
 +</div> 
 +<div style="float:right; width:49%;"> 
 +  <dl> 
 +    <dt><input type="radio" name="utfForm" onchange="utfRecalculate()" value="NFKD"> NFKD</dt> 
 +    <dd>Compatibility Decomposition</dd> 
 +    <dt><input type="radio" name="utfForm" onchange="utfRecalculate()" value="NFKC"> NFKC</dt> 
 +    <dd>Compatibility Decomposition, followed by Canonical Composition</dd> 
 +  </dl> 
 +</div> 
 + 
 +<textarea id='utfResult' style='width:100%; font-family:Consolas, "Andale Mono WT", "Andale Mono", "Bitstream Vera Sans Mono", "Nimbus Mono L", Monaco, "Courier New", monospace; font-size:11px;' rows='10' cols='120' readonly="readonly"></textarea> 
 + 
 +</HTML> 
 + 
 +===== Text Analysis ===== 
 + 
 +<HTML> 
 + 
 +<div style="float:left; width:49%;"> 
 +  <dl> 
 +    <dt><input type="radio" name="utfAnalysisSource" onchange="utfAnalysis()" value="original" checked="checked"> Source text</dt> 
 +    <dd>Analyse original string as entered</dd> 
 +    <dt><input type="radio" name="utfAnalysisSource" onchange="utfAnalysis()" value="normalised"> Result text</dt> 
 +    <dd>Analyse the resulting string after normalisation</dd> 
 +  </dl> 
 +</div> 
 + 
 +</HTML> 
 + 
 +==== Text Rundown ==== 
 + 
 +<HTML> 
 +<div id='utfAnalysisText' style='width:100%; font-family:Consolas, "Andale Mono WT", "Andale Mono", "Bitstream Vera Sans Mono", "Nimbus Mono L", Monaco, "Courier New", monospace; font-size:11px;'></div> 
 +</HTML>
  
  
-==== Analysis of Characters ====+==== Statistics of Characters ====
  
-<php>?>+<HTML>
  
 <table id='utfAnalysisCharacters' class="inline" style='width:100%; font-family:Consolas, "Andale Mono WT", "Andale Mono", "Bitstream Vera Sans Mono", "Nimbus Mono L", Monaco, "Courier New", monospace; font-size:10px;'></table> <table id='utfAnalysisCharacters' class="inline" style='width:100%; font-family:Consolas, "Andale Mono WT", "Andale Mono", "Bitstream Vera Sans Mono", "Nimbus Mono L", Monaco, "Courier New", monospace; font-size:10px;'></table>
  
-<?php</php>+</HTML>
  
  
Riadok 217: Riadok 291:
  
 ~~socialite~~ ~~socialite~~
-{{tag>tools}}+{{tag>tools Unicode UTF-8}}
  
  
Riadok 224: Riadok 298:
 ~~DISQUS~~ ~~DISQUS~~
  
-<php>?>+<HTML>
 <script type="text/javascript"> <script type="text/javascript">
  
Riadok 33075: Riadok 33149:
  
 </script> </script>
-<?php</php>+</HTML>
  
blog/odborny/2019-06-28-unicode_normalization_tool.1573353568.txt.gz · Posledná úprava: 2019/11/10 03:39 od Róbert Toth