ajoute scriptt renomme ・ 4436266 ・ Gitprep

+194

renomme

...	...	@@ -0,0 +1,194 @@
	1	+#!/bin/bash
	2	+
	3	+# script pour renommer des personnages ou lieux dans un document Manuskript
	4	+# ./renomme <document manuskript> <ancien nom> <nouveau nom> [ins=poids] [del=poids] [rep=poids]
	5	+#
	6	+# Le document d'origine est sauvegardé (voir variable $backup)
	7	+#
	8	+# Si <nouveau nom> est "check", ou "prox" ou "leven", une étude de proximité est alors effectuée
	9	+# sur l'algorithme Levenshtein. Les paramètres de poids sont des nombres entiers, et permettent
	10	+# de pondérer l'ajout (ins=), la suppression (del=) et le remplacement (rep=) de caractère.
	11	+# par défaut chacun des trois paramètres est égal à 1.
	12	+# Il n'y a pas d'ordre obligatoire pour le paramétrage, et si un paramétrage est effectué plusieurs fois
	13	+# c'est le plus à gauche qui prend la priorité. Il n'y a pas de backup effectué pour l'opération de vérification de la proximité
	14	+# crédit algo: https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance
	15	+
	16	+set -e
	17	+
	18	+manuscrit=${1:?}
	19	+ancien=${2:?}
	20	+nouveau=${3:?}
	21	+
	22	+if test "${manuscrit:0:1}" != '/'; then
	23	+ manuscrit="$PWD/$manuscrit"
	24	+fi
	25	+test -r "$manuscrit" \|\| exit 1
	26	+test $(file --brief --mime-type --dereference "$manuscrit") == 'application/zip' \|\| exit 2
	27	+
	28	+backup="${manuscrit/%.msk} (avant renommage de «${ancien}» en «${nouveau}»).msk"
	29	+
	30	+function trap_exit () {
	31	+ rm -fr $temp
	32	+ cd - > /dev/null
	33	+}
	34	+
	35	+function determinant () {
	36	+ eval "local nom=\"\$$1\""
	37	+ if [[ ${nom:0:1} == @(A\|E\|H\|I\|O\|U) \
	38	+ \|\| ${nom:0:1} == @(Â\|Ê\|H\|Î\|Ô\|Û) \
	39	+ \|\| ${nom:0:1} == @(Ä\|Ë\|H\|Ï\|Ö\|Ü) \
	40	+ \|\| ${nom:0:1} == @(À\|È\|H\|Ì\|Ò\|Ù) \
	41	+ \|\| ${nom:0:1} == @(Á\|É\|H\|Í\|Ó\|Ú) \
	42	+ ]]; then
	43	+ eval "determinant_$1=\"d'\""
	44	+ eval "determinant_$1_formatted=\"d-\""
	45	+ else
	46	+ eval "determinant_$1=\"de \""
	47	+ eval "determinant_$1_formatted=\"de-\""
	48	+ fi
	49	+}
	50	+
	51	+function format () {
	52	+ eval "$1_formatted=\$(tr --complement --squeeze-repeats 'A-Za-z-_\n' - <<< \"\${$1// /_}\")"
	53	+}
	54	+
	55	+function renomme_fichiers () {
	56	+ for char in $(find characters -type f -regex "characters/[0-9]+-$1.txt"); do
	57	+ local new_char=$(sed "s/$1/$2/" <<< $char)
	58	+ echo "character: $char -> $new_char"
	59	+ mv $char $new_char
	60	+ break
	61	+ done
	62	+
	63	+ for chapter in $(find outline -type d -regex "outline/.$1."); do
	64	+ local new_chapter=$(sed "s/$1/$2/g" <<< $chapter)
	65	+ echo "chapter: $chapter -> $new_chapter"
	66	+ mv $chapter $new_chapter
	67	+ done
	68	+
	69	+ for part in $(find outline -type f -regex "outline/[^/]$1[^/].md"); do
	70	+ local new_part=$(sed "s/$1/$2/g" <<< $part)
	71	+ echo "part: $part -> $new_part"
	72	+ mv $part $new_part
	73	+ done
	74	+}
	75	+
	76	+trap trap_exit EXIT
	77	+
	78	+temp=$(mktemp --dry-run /dev/shm/XXXXXXXXX)
	79	+mkdir $temp
	80	+cd $temp
	81	+
	82	+if [[ $nouveau = @(check\|prox\|leven) ]]; then
	83	+ unzip -qq "$manuscrit"
	84	+ for param in $(seq 4 $#); do
	85	+ for dst in ins del rep; do
	86	+ eval "if test -n '\$$param' && [[ \"\$$param\" =~ $dst=[0-9]+ ]]; then cost_$dst=\${$param#*=}; fi"
	87	+ done
	88	+ done
	89	+ echo paramètres d\'approximation
	90	+ echo "caractère manquant (del=): ${cost_del:-1}"
	91	+ echo "caractère inséré (ins=): ${cost_ins:-1}"
	92	+ echo "caractère remplacé (rep=): ${cost_rep:-1}"
	93	+ for f in $(find . -type f); do
	94	+ let wc+=$(wc -w < $f)
	95	+ done
	96	+ awk -v ancien=$ancien -v wc=$wc -v cost_ins=${cost_ins:-1} -v cost_del=${cost_del:-1} -v cost_rep=${cost_rep:-1} '
	97	+ BEGIN {
	98	+ RS="[[:punct:]]"
	99	+ progress_mod = 10
	100	+ actual_progress = 0
	101	+ pct_progress = 0
	102	+ progress = 0
	103	+ found_words = 0
	104	+
	105	+ str1_len = length(ancien)
	106	+ for (i=1; i<=str1_len; i++)
	107	+ str1_substr[i]=substr(ancien, i, 1)
	108	+ }
	109	+ function levenshtein(str2) {
	110	+ str2_len = length(str2)
	111	+ if(str2_len == 0) return str1_len * cost_del
	112	+ for(j = 1; j <= str2_len; j++)
	113	+ str2_substr[j]=substr(str2, j, 1)
	114	+ matrix[0, 0] = 0
	115	+ for(i = 1; i <= str1_len; i++) {
	116	+ matrix[i, 0] = i * cost_del
	117	+ for(j = 1; j <= str2_len; j++) {
	118	+ matrix[0, j] = j * cost_ins
	119	+ x = matrix[i - 1, j] + cost_del
	120	+ y = matrix[i, j - 1] + cost_ins
	121	+ z = matrix[i - 1, j - 1] + (str1_substr[i] == str2_substr[j] ? 0 : cost_rep)
	122	+ x = x < y ? x : y
	123	+ matrix[i, j] = x < z ? x : z
	124	+ }
	125	+ }
	126	+ return matrix[str1_len, str2_len]
	127	+ }
	128	+ {
	129	+ for (word=1; word<=NF; word++) {
	130	+ progress++
	131	+ lvstn = levenshtein(gensub("[[:punct:]]","","g",$word))
	132	+ if (lvstn <= 3 && lvstn > 0) {
	133	+ approx_possibles[$word]++
	134	+ found_words++
	135	+ }
	136	+ pct_progress=int(progress / wc * 100)
	137	+ if (actual_progress < pct_progress && pct_progress % progress_mod == 0) {
	138	+ actual_progress = pct_progress
	139	+ printf("%i%\n", actual_progress)
	140	+ }
	141	+ }
	142	+ }
	143	+ END {
	144	+ if (found_words > 0) {
	145	+ printf("mot%s proche%s de «%s» [occurences]\n", found_words > 0 ? "s" : "", found_words > 0 ? "s" : "", ancien)
	146	+ for (i in approx_possibles)
	147	+ printf("- %s [%i]\n", i, approx_possibles[i])
	148	+ }
	149	+ else {
	150	+ print "aucun mot proche et différent de «" ancien "» trouvé"
	151	+ }
	152	+ }
	153	+ ' $(find . -type f)
	154	+ exit
	155	+fi
	156	+
	157	+mv --backup=numbered "$manuscrit" "$backup"
	158	+
	159	+unzip -qq "$backup"
	160	+
	161	+for version in ancien nouveau; do
	162	+ format $version
	163	+ determinant $version
	164	+done
	165	+
	166	+declare -A remplacement
	167	+remplacement=(
	168	+ [$ancien]="$nouveau"
	169	+ [$ancien_formatted]="$nouveau_formatted"
	170	+ [$determinant_ancien$ancien]="$determinant_nouveau$nouveau"
	171	+ [$determinant_ancien_formatted$ancien_formatted]="$determinant_nouveau_formatted$nouveau_formatted"
	172	+)
	173	+
	174	+renomme_fichiers "$ancien_formatted" "$nouveau_formatted"
	175	+renomme_fichiers "$determinant_ancien_formatted$ancien_formatted" "$determinant_nouveau_formatted$nouveau_formatted"
	176	+
	177	+egrep --word-regexp --only-matching --recursive --regexp="($determinant_ancien\|$determinant_ancien_formatted)\?($ancien\|$ancien_formatted)" . \
	178	+\| awk -v name="$1" -F ':' '
	179	+ {
	180	+ if ($NF > 0) {
	181	+ file[$1]++
	182	+ nb++
	183	+ }
	184	+ }
	185	+ END {
	186	+ printf("remplacement de %i occurences pour %s dans %i fichiers\n", nb, name, asort(file))
	187	+ }'
	188	+
	189	+for regexp in "${!remplacement[@]}"; do
	190	+ egrep --word-regexp --files-with-matches --recursive --regexp="$regexp" . \
	191	+ \| xargs --no-run-if-empty sed --regexp-extended --in-place "s/(\W\|^)$regexp(\W\|$)/\1${remplacement[$regexp]}\2/g"
	192	+done
	193	+
	194	+zip --recurse-paths --no-dir-entries -qq "${manuscrit}" *