Bonjour,
Je suis chercheur et j’ai besoin de récupérer des données horodatées (qui écrit quoi et quand) à des fins d’analyse.
Y a-t-il un moyen de récupérer rapidement l’ensemble des données de l’historique dynamique ? (sous forme de tableau Excel par exemple ?)
Merci à tous !
Bjr,
Et quel a été le retour d’expérience suite à la réponse de @PaliPalo en juillet dernier ?
++
Merci @Geppetto pour me rendre hommage
En outre, fut un temps, j’avais cherché un moyen de sortir ce genre d’info au format CSV. Mais je n’ai pas vraiment pris le temps de faire des tests pour voir si ça fonctionnait bien.
Bref, voici le script Python que j’avais réadapté à partir de la version avec interface de navigation, pour sortir une liste au format CSV. En espérant que ça puisse aider, mais sans garantie que ce soit nickel-chrome.
"""
EtherpadHistory helps list the changes made to an Etherpad text.
Copyright (C) 2024 Pali Palo
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
from datetime import datetime
import json
import os
import sys
import shlex
import struct
import platform
import subprocess
# This code builds a system-dependent alias
# for clearing the entire terminal screen
def get_clear():
    """Build a callable that clears the whole terminal screen.

    The shell command is chosen once, from ``platform.system()``:
    ``clear`` on Linux, macOS (Darwin) and Cygwin, ``cls`` on Windows and
    on every platform that is not recognised.

    Returns:
        A zero-argument callable that runs the chosen command through
        ``os.system`` and returns its result.
    """
    system_name = platform.system()
    uses_clear = (system_name in ('Linux', 'Darwin')
                  or system_name.startswith('CYGWIN'))
    command = 'clear' if uses_clear else 'cls'
    return lambda: os.system(command)


# Module-level alias: call clear() to wipe the screen.
clear = get_clear()
# The following code is used to get terminal attributes under Linux,
# Mac and Windows. (Hope this really works)
# This has been found here https://gist.github.com/jtriley/1108174
def get_terminal_size():
    """Return the console size as a ``(width, height)`` tuple.

    Probing depends on ``platform.system()``:

    - Windows: the console API probe first, then ``tput`` when that probe
      yields nothing (needed for Windows' Python in Cygwin's xterm).
    - Linux, Darwin and CYGWIN*: the ioctl/environment probe.

    When no probe produced a value, ``(80, 25)`` is returned.

    Originally retrieved from:
    http://stackoverflow.com/questions/566746/how-to-get-console-window-width-in-python
    """
    system_name = platform.system()
    size = None

    if system_name == 'Windows':
        size = _get_terminal_size_windows()
        if size is None:
            # Needed for Windows' Python running in Cygwin's xterm.
            size = _get_terminal_size_tput()

    posix_like = (system_name in ['Linux', 'Darwin']
                  or system_name.startswith('CYGWIN'))
    if posix_like:
        size = _get_terminal_size_linux()

    # Fall back to a conventional default when nothing could be measured.
    return (80, 25) if size is None else size
def _get_terminal_size_windows():
    """Query the Windows console API for the visible window size.

    Returns:
        ``(columns, rows)`` computed from the window rectangle reported by
        ``GetConsoleScreenBufferInfo``, or ``None`` when ``ctypes.windll``
        cannot be imported (non-Windows interpreters), the API call reports
        failure, or the returned buffer cannot be unpacked.
    """
    try:
        from ctypes import windll, create_string_buffer
        # Standard handle ids: stdin is -10, stdout is -11, stderr is -12.
        # The stderr handle is the one queried here.
        handle = windll.kernel32.GetStdHandle(-12)
        csbi = create_string_buffer(22)
        res = windll.kernel32.GetConsoleScreenBufferInfo(handle, csbi)
        if not res:
            return None
        # Only the window rectangle is needed; the other fields of the
        # 22-byte structure are unpacked and ignored.
        (_bufx, _bufy, _curx, _cury, _wattr,
         left, top, right, bottom,
         _maxx, _maxy) = struct.unpack("hhhhHhhhhhh", csbi.raw)
    except Exception:
        # Best-effort probe: any failure means "size unknown". Unlike a
        # bare ``except``, this lets KeyboardInterrupt/SystemExit through.
        return None
    return right - left + 1, bottom - top + 1
def _get_terminal_size_tput():
    """Ask the ``tput`` utility for the terminal size.

    src: http://stackoverflow.com/questions/263890/how-do-i-find-the-width-height-of-a-terminal-window

    Returns:
        ``(columns, rows)`` parsed from the output of ``tput cols`` and
        ``tput lines``, or ``None`` when ``tput`` cannot be started, exits
        with an error, or prints something that is not an integer.
    """
    try:
        # check_output returns what tput printed; check_call would only
        # return the exit status (0 on success), not the size.
        cols = int(subprocess.check_output(shlex.split('tput cols')))
        rows = int(subprocess.check_output(shlex.split('tput lines')))
    except (OSError, subprocess.SubprocessError, ValueError):
        # Best-effort probe: report "unknown" instead of failing.
        return None
    return (cols, rows)
def _get_terminal_size_linux():
    """Determine the terminal size on POSIX-like systems.

    Probes, in order: the ``TIOCGWINSZ`` ioctl on file descriptors 0, 1
    and 2; the same ioctl on the device named by ``os.ctermid()``; and
    finally the ``LINES`` / ``COLUMNS`` environment variables.

    Returns:
        ``(columns, rows)`` as integers, or ``None`` when every probe
        failed or the environment values are not valid integers.
    """
    def ioctl_GWINSZ(fd):
        """Return the two shorts unpacked from TIOCGWINSZ on *fd*, or None."""
        try:
            import fcntl
            import termios
            # The 4-byte argument only sizes the result buffer; the two
            # shorts unpacked from it are used as (rows, columns) below.
            return struct.unpack(
                'hh', fcntl.ioctl(fd, termios.TIOCGWINSZ, b'1234'))
        except (ImportError, AttributeError, OSError, ValueError,
                struct.error):
            return None

    cr = ioctl_GWINSZ(0) or ioctl_GWINSZ(1) or ioctl_GWINSZ(2)
    if not cr:
        try:
            fd = os.open(os.ctermid(), os.O_RDONLY)
        except (AttributeError, OSError):
            # No ctermid() on this platform, or the device cannot be opened.
            pass
        else:
            try:
                cr = ioctl_GWINSZ(fd)
            finally:
                os.close(fd)
    if not cr:
        try:
            cr = (os.environ['LINES'], os.environ['COLUMNS'])
        except KeyError:
            return None
    try:
        return int(cr[1]), int(cr[0])
    except ValueError:
        # LINES / COLUMNS were set but are not numeric.
        return None
# Console width in columns (first item of the size tuple), measured once
# when this module-level statement runs; get_terminal_size() substitutes
# 80 when no probe succeeds.
console_columns = get_terminal_size() [0]
# This code is used to get user input one char at a time
# without requesting the user to validate it with enter key
def _find_getch():
    """Select a function that reads one character of user input.

    Returns:
        ``msvcrt.getch`` when ``termios`` cannot be imported (non-POSIX
        systems); otherwise a function that puts the stdin terminal into
        raw mode, reads a single character from ``sys.stdin``, restores the
        previous terminal settings and returns the character.
    """
    try:
        import termios
    except ImportError:
        # Non-POSIX. Hand back msvcrt's (Windows') getch.
        from msvcrt import getch as windows_getch
        return windows_getch

    # POSIX system: build a reader that manipulates the tty.
    import sys
    import tty

    def read_single_char():
        stdin_fd = sys.stdin.fileno()
        saved_settings = termios.tcgetattr(stdin_fd)
        try:
            tty.setraw(stdin_fd)
            return sys.stdin.read(1)
        finally:
            # Always put the terminal back the way it was found.
            termios.tcsetattr(stdin_fd, termios.TCSADRAIN, saved_settings)

    return read_single_char


# Module-level alias: getch() returns the next typed character.
getch = _find_getch()
# This code is used to decode base36 integer value found
# in the changeset.
# Base36 is a numeral system such as this:
# 0, 1, 2, ..., 9, a, b, c, ..., z, 10, 11, .., 19, 1a, ...
def base36decode(number):
    """Convert a base-36 numeral (digits 0-9 then a-z) to an integer.

    Args:
        number: the numeral as a string, e.g. ``'1a'``.

    Returns:
        The integer value, as computed by ``int(number, 36)``.

    Raises:
        ValueError: if *number* is not a valid base-36 numeral.
    """
    radix = 36
    return int(number, radix)
# This code is used to get the integer value for a command
# in a changeset
def get_changeset_value(changes_string, i):
    """Read the base-36 number that starts at index *i*.

    Characters are consumed until a command character (one of
    ``:><+-=|*``) or the end of the string is reached.

    Args:
        changes_string: the operations part of a changeset.
        i: index of the first character of the number.

    Returns:
        ``(value, next_index)``: the decoded integer and the index of the
        first character that was not consumed.

    Raises:
        ValueError: if no valid base-36 numeral is found at index *i*.
    """
    allowed_commands = ':><+-=|*'
    digits = []
    while i < len(changes_string) and changes_string[i] not in allowed_commands:
        digits.append(changes_string[i])
        i += 1
    return base36decode(''.join(digits)), i
# This code is used to apply the changes to the previous
# version of the text, by following the commands found
# in the changeset.
# Its output is a displayable text in which the changes are colored,
# and the text modified according to the changeset. This allows the
# caller to display the changes visually and to reuse
# the text for the next changeset.
def _highlight_change(text_part, color_code):
    """Wrap *text_part* between the ANSI escape *color_code* and a reset.

    A trailing newline is replaced by a space followed by the newline
    (presumably so that the coloured background has a cell to show on).
    """
    if text_part.endswith('\n'):
        text_part = text_part[:-1] + ' \n'
    return color_code + text_part + '\33[0m'


def apply_pad_revision(revision_ID, text):
    """Apply the changeset of one revision to the previous pad text.

    The changeset is read from the module-level ``revisions`` mapping as
    ``revisions[revision_ID]['changeset']``. It is split at the first
    ``'$'``: the left part lists the operations, the right part holds the
    characters to insert.

    Args:
        revision_ID: key of the revision in ``revisions``.
        text: the pad text as it was before this revision.

    Returns:
        ``(displayed_text, modified_text)`` where

        - ``displayed_text`` contains the inserted text on a green
          background (``ESC[42;1m``), the removed text on a red background
          (``ESC[41;1m``), then the part of *text* located after the last
          operation. Spans kept by ``=`` operations are left out.
        - ``modified_text`` is the complete text after the revision, to be
          passed as *text* for the next revision.
    """
    changes_string, _, revision_text = (
        revisions[revision_ID]['changeset'].partition('$'))
    revision_text_pos = 0   # cursor in the characters to insert
    original_text_pos = 0   # cursor in the previous text
    modified_parts = []
    displayed_parts = []

    i = 1  # 1 instead of 0 to bypass the leading Z char, only a magic byte
    while i < len(changes_string):
        command = changes_string[i]
        value, i = get_changeset_value(changes_string, i + 1)
        if command == '|':
            # '|' carries the number of newline chars covered by the
            # operation that follows; the operation itself and its
            # character count come right after, so read them and handle
            # them like the plain form.
            command = changes_string[i]
            value, i = get_changeset_value(changes_string, i + 1)

        if command == '+':
            # Insert <value> characters taken from the revision text.
            # The cursor in the previous text does not move.
            inserted = revision_text[revision_text_pos:revision_text_pos + value]
            revision_text_pos += value
            modified_parts.append(inserted)
            displayed_parts.append(_highlight_change(inserted, '\33[42;1m'))
        elif command == '-':
            # Remove <value> characters of the previous text: they are
            # shown, but not copied to the new text.
            removed = text[original_text_pos:original_text_pos + value]
            original_text_pos += value
            displayed_parts.append(_highlight_change(removed, '\33[41;1m'))
        elif command == '=':
            # Keep <value> characters of the previous text. They must be
            # copied to the new text, otherwise the next changeset is
            # applied to a truncated text; they are not displayed.
            modified_parts.append(
                text[original_text_pos:original_text_pos + value])
            original_text_pos += value
        # ':' (previous text length), '>' / '<' (number of chars added /
        # removed) and '*' (attributes of the following change, e.g. its
        # author or a style) are not needed here and are skipped.

    # Whatever follows the last operation is unchanged.
    tail = text[original_text_pos:]
    displayed_parts.append(tail)
    modified_parts.append(tail)
    return ''.join(displayed_parts), ''.join(modified_parts)
if __name__ == "__main__":
    # Only the script entry point writes CSV, so the module is imported here.
    import csv

    # Handle the script parameter.
    if len(sys.argv) != 2:
        # stdout carries the CSV data, so the usage message goes to stderr.
        print("Please give one etherpad file name as argument for this script",
              file=sys.stderr)
        sys.exit(1)

    # Open the etherpad (which is indeed a JSON export of a variable).
    with open(sys.argv[1], encoding='utf-8') as json_file:
        document = json.load(json_file)

    # First, extract only the revisions from the Etherpad data.
    # (In the meantime, build the authors list to ease display later.)
    # `revisions` stays a module-level name: apply_pad_revision() reads it.
    revisions = {}
    authors = {}
    for entry_ID, entry in document.items():
        tokens = entry_ID.split(':')
        if entry_ID.startswith('pad:'):
            # For all pad entries, tokens[0] = "pad" and tokens[1] = pad ID.
            # For revision entries, tokens[2] = "revs" and tokens[3] =
            # revision sequential number.
            # (There might be several pads by design but not in this case.)
            if len(tokens) == 4 and tokens[2] == 'revs':
                revisions[tokens[3]] = entry
        elif entry_ID.startswith('globalAuthor:'):
            # The name may be missing or null; that is resolved on output.
            authors[tokens[1]] = entry.get('name')

    # Then walk the revisions in sequential order, changing the text one
    # step at a time and writing one CSV row per revision:
    # revision number, author, ISO timestamp, highlighted changes.
    # csv.writer quotes fields containing commas, quotes or newlines.
    writer = csv.writer(sys.stdout, lineterminator='\n')
    # By default there is no text yet.
    # NOTE(review): Etherpad changesets for revision 0 may expect a lone
    # newline as the starting text -- confirm whether the seed should be '\n'.
    text = ''
    for revision_ID in sorted(revisions, key=int):
        # Revision 0 is processed like any other one, so that the running
        # text includes it.
        revision_info = revisions[revision_ID].get('meta', {})
        author_ID = revision_info.get('author')
        # Fall back to the raw author ID, then to an empty field, when no
        # display name is known.
        author = authors.get(author_ID) or author_ID or ''
        timestamp_ms = revision_info.get('timestamp')
        if timestamp_ms is None:
            timestamp = ''
        else:
            # The stored value is divided by 1000 (milliseconds to seconds).
            timestamp = datetime.fromtimestamp(timestamp_ms / 1000).isoformat()
        displayed_text, text = apply_pad_revision(revision_ID, text)
        writer.writerow([revision_ID, author, timestamp, displayed_text])
Merci +++ à vous deux !!!
Je teste cela et vous tiens au courant.
Bien à vous JL
Bjr Gepetto,
Je n’avais pas vu la réponse de juillet… Merci à vous deux. Je teste cela!