Comment récupérer rapidement les données de l'historique dynamique?

JChauvin5 · Novembre 21, 2024, 8:29

Bonjour,
Je suis chercheur et j’ai besoin de récupérer des données horodatées (qui écrit quoi et quand) à des fins d’analyse.
Y a-t-il un moyen de récupérer rapidement l’ensemble des données de l’historique dynamique ? (sous forme de tableau Excel par exemple ?)
Merci à tous !

Geppetto · Novembre 21, 2024, 8:50

Bjr,
Et quel a été le retour d’expérience suite à la réponse de @PaliPalo en juillet dernier ?

++

PaliPalo · Novembre 21, 2024, 9:05

Merci @Geppetto pour me rendre hommage

En outre, fut un temps, j’avais cherché un moyen de sortir ce genre d’info au format CSV. Mais je n’ai pas vraiment pris le temps de faire des tests pour voir si ça fonctionnait bien.

Bref, voici le script Python que j’avais réadapté de la version avec interface de navigation, pour sortir une liste au format CSV. En espérant que ça puisse aider, mais sans garantie que ce soit nickel-chrome.

"""
    EtherpadHistory helps list the changes made to an Etherpad text.
    
    Copyright (C) 2024  Pali Palo

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""

from datetime import datetime
import json
import os
import sys
import shlex
import struct
import platform
import subprocess

# This code builds a system-dependent alias
# for clearing the entire terminal screen
def get_clear():
    """Return a zero-argument callable that clears the terminal screen.

    The shell command is chosen from ``platform.system()``: ``clear`` on
    Linux, macOS (``Darwin``) and Cygwin, ``cls`` on Windows and on every
    other platform.  The callable returns whatever ``os.system`` returns.
    """
    system_name = platform.system()
    unix_like = (system_name in ('Linux', 'Darwin')
                 or system_name.startswith('CYGWIN'))
    # Windows and unrecognised systems both end up with 'cls'.
    shell_command = 'clear' if unix_like else 'cls'
    return lambda: os.system(shell_command)


clear = get_clear()

# The following code is used to get terminal attributes under Linux,
# Mac and Windows. (Hope this really works)
# This has been found here https://gist.github.com/jtriley/1108174
 
def get_terminal_size():
    """Return the console size as a ``(width, height)`` tuple.

    Intended for Linux, OS X, Windows and Cygwin (Windows).  When the
    platform helper cannot tell the size, or the platform is not
    recognised, ``(80, 25)`` is returned.

    Originally retrieved from:
    http://stackoverflow.com/questions/566746/how-to-get-console-window-width-in-python
    """
    system_name = platform.system()
    posix_like = (system_name in ('Linux', 'Darwin')
                  or system_name.startswith('CYGWIN'))
    if posix_like:
        size = _get_terminal_size_linux()
    elif system_name == 'Windows':
        size = _get_terminal_size_windows()
        if size is None:
            # needed for Windows' Python running in Cygwin's xterm
            size = _get_terminal_size_tput()
    else:
        size = None
    return (80, 25) if size is None else size
 
def _get_terminal_size_windows():
    """Return the console size as ``(columns, rows)`` through the Win32 API.

    Returns ``None`` when the size cannot be obtained: ``ctypes.windll`` is
    not importable (non-Windows Python), the console query reports failure,
    or the call or the decoding of its result raises.
    """
    try:
        from ctypes import windll, create_string_buffer
    except ImportError:
        # ctypes has no windll outside Windows.
        return None
    try:
        # Standard handle ids: stdin is -10, stdout is -11, stderr is -12.
        # The stderr handle is the one queried here.
        handle = windll.kernel32.GetStdHandle(-12)
        csbi = create_string_buffer(22)
        if not windll.kernel32.GetConsoleScreenBufferInfo(handle, csbi):
            return None
        # Fields, in order: bufx, bufy, curx, cury, wattr,
        # left, top, right, bottom, maxx, maxy.  Only the window
        # rectangle (fields 5 to 8) is needed.
        fields = struct.unpack("hhhhHhhhhhh", csbi.raw)
    except (AttributeError, OSError, struct.error):
        # Best effort only: report "unknown" instead of failing, but let
        # KeyboardInterrupt / SystemExit through.
        return None
    left, top, right, bottom = fields[5:9]
    return right - left + 1, bottom - top + 1
 
def _get_terminal_size_tput():
    """Return ``(columns, rows)`` as printed by the ``tput`` utility.

    Returns ``None`` when ``tput`` cannot be started, exits with an error
    status, or prints something that is not an integer.
    """
    # src: http://stackoverflow.com/questions/263890/how-do-i-find-the-width-height-of-a-terminal-window
    try:
        # check_output captures what tput prints.  check_call would only
        # hand back the exit status (0 on success), never the size.
        cols = int(subprocess.check_output(shlex.split('tput cols')))
        rows = int(subprocess.check_output(shlex.split('tput lines')))
    except (OSError, subprocess.CalledProcessError, ValueError):
        return None
    return (cols, rows)
 
def _get_terminal_size_linux():
    """Return the terminal size as ``(columns, rows)`` on POSIX systems.

    Three sources are tried in turn:

    1. the ``TIOCGWINSZ`` ioctl on file descriptors 0, 1 and 2;
    2. the same ioctl on the terminal named by ``os.ctermid()``;
    3. the ``LINES`` and ``COLUMNS`` environment variables.

    Returns ``None`` when none of them gives a usable answer, including
    when the environment variables are not integers.
    """
    def ioctl_GWINSZ(fd):
        # Returns the (rows, columns) pair unpacked from the ioctl reply,
        # or None when the query is not possible on this descriptor.
        try:
            import fcntl
            import termios
            return struct.unpack(
                'hh', fcntl.ioctl(fd, termios.TIOCGWINSZ, b'1234'))
        except (ImportError, AttributeError, OSError, struct.error):
            return None

    cr = ioctl_GWINSZ(0) or ioctl_GWINSZ(1) or ioctl_GWINSZ(2)
    if not cr:
        try:
            fd = os.open(os.ctermid(), os.O_RDONLY)
        except (AttributeError, OSError):
            pass
        else:
            try:
                cr = ioctl_GWINSZ(fd)
            finally:
                os.close(fd)
    if not cr:
        try:
            cr = (os.environ['LINES'], os.environ['COLUMNS'])
        except KeyError:
            return None
    try:
        # cr is (rows, columns); callers expect (columns, rows).
        return int(cr[1]), int(cr[0])
    except ValueError:
        # LINES / COLUMNS held something that is not a number.
        return None

# Width of the terminal in columns, sampled once while the module loads.
console_columns = get_terminal_size()[0]

# This code is used to get user input one char at a time
# without requesting the user to validate it with enter key

def _find_getch():
    """Return a function that reads a single key press from the user.

    When ``termios`` cannot be imported (non-POSIX, i.e. Windows) the
    result is ``msvcrt.getch``.  Otherwise it is a function that switches
    stdin to raw mode, reads one character and restores the previous
    terminal settings before returning that character.
    """
    try:
        import termios
    except ImportError:
        # Non-POSIX: hand back msvcrt's (Windows') getch.
        from msvcrt import getch as windows_getch
        return windows_getch

    # POSIX: build a getch that manipulates the tty.
    import sys
    import tty

    def read_single_char():
        stdin_fd = sys.stdin.fileno()
        saved_attrs = termios.tcgetattr(stdin_fd)
        try:
            tty.setraw(stdin_fd)
            return sys.stdin.read(1)
        finally:
            # Always put the terminal back the way it was found.
            termios.tcsetattr(stdin_fd, termios.TCSADRAIN, saved_attrs)

    return read_single_char


getch = _find_getch()

# This code is used to decode the base36 integer values found
# in the changeset.
# Base36 is a numeral system whose values are counted like this:
#   0, 1, 2, ..., 9, a, b, c, ..., z, 10, 11, ..., 19, 1a, ...

def base36decode(number):
    """Return the integer written in base 36 by the string *number*."""
    return int(number, base=36)


def get_changeset_value(changes_string, i):
    """Read the integer operand of a changeset command.

    Characters of *changes_string* are consumed from index *i* up to the
    next command character (one of ``:><+-=|*``) or the end of the string,
    and decoded as a base-36 number.

    Returns a ``(value, next_index)`` tuple, *next_index* being the index
    of the first character that was not consumed.
    """
    command_chars = ':><+-=|*'
    digits = []
    while i < len(changes_string) and changes_string[i] not in command_chars:
        digits.append(changes_string[i])
        i += 1
    return base36decode(''.join(digits)), i

# This code is used to apply the changes to the previous
# version of the text, by following the commands found
# in the changeset.
# Its output is a displayable text in which the changes are colored,
# and the text modified according to the changeset. This allows the
# caller to display the changes visually and to carry the text over
# to the next changeset.

def apply_pad_revision( revision_ID, text ) :
    """Apply the changeset of one revision to *text*.

    The changeset string is read from the module-level ``revisions``
    mapping (``revisions[revision_ID]['changeset']``).  Everything before
    the first ``$`` is a sequence of commands, each one a command character
    followed by a base-36 number; everything after it is the text that the
    ``+`` commands insert.

    Parameters:
        revision_ID: key of the revision inside ``revisions``.
        text: the pad text as it was before this revision.

    Returns:
        A ``(displayed_text, modified_text)`` tuple.  ``modified_text`` is
        the text after the revision, to be fed to the next call.
        ``displayed_text`` holds the inserted chunks (ANSI style
        ``42;1``), the removed chunks (ANSI style ``41;1``) and finally the
        part of *text* located after the last command; spans kept by ``=``
        commands are left out of it.
    """
    global revisions
    changes_string, _, revision_text = (
        revisions[ revision_ID ][ 'changeset' ].partition( '$' ) )

    added_style = '\33[42;1m'
    removed_style = '\33[41;1m'
    reset_style = '\33[0m'

    def highlight( chunk, style ) :
        # A space is slipped in before a trailing newline, presumably so
        # that the coloured background shows up on that line
        # -- NOTE(review): confirm the intent.
        if chunk.endswith( '\n' ) :
            chunk = chunk[ : -1 ] + ' \n'
        return style + chunk + reset_style

    original_text_pos = 0   # read cursor in the previous text
    revision_text_pos = 0   # read cursor in the inserted characters
    modified_parts = []
    displayed_parts = []
    i = 1 # 1 instead of 0 to skip the leading Z char, which is only a magic byte
    while i < len( changes_string ) :
        command = changes_string[i]
        value, i = get_changeset_value( changes_string, i + 1 )
        if command == '|' :
            # '|N' only announces that the command that follows spans N
            # newline chars; the length to process is that command's own
            # number, so the command is handled exactly like a plain one.
            command = changes_string[i]
            value, i = get_changeset_value( changes_string, i + 1 )
        if command == '+' :
            # Insert <value> characters taken from the inserted text.
            # The cursor in the previous text does not move.
            chunk = revision_text[ revision_text_pos : revision_text_pos + value ]
            revision_text_pos += value
            modified_parts.append( chunk )
            displayed_parts.append( highlight( chunk, added_style ) )
        elif command == '-' :
            # Remove <value> characters of the previous text: they are
            # shown, but not copied to the new text.
            chunk = text[ original_text_pos : original_text_pos + value ]
            original_text_pos += value
            displayed_parts.append( highlight( chunk, removed_style ) )
        elif command == '=' :
            # Keep <value> characters of the previous text.  They must be
            # copied to the new text, otherwise every later revision is
            # applied to a text that has lost them.
            # NOTE(review): kept spans are deliberately not added to the
            # displayed text, as before; confirm this is the wanted output.
            modified_parts.append(
                text[ original_text_pos : original_text_pos + value ] )
            original_text_pos += value
        # ':' (previous text length), '>' / '<' (number of chars added /
        # removed) and '*' (attribute applied to the following change,
        # e.g. author or style) carry nothing needed here and are skipped.

    # Whatever follows the last command is kept unchanged.
    remainder = text[ original_text_pos : ]
    displayed_parts.append( remainder )
    modified_parts.append( remainder )

    return ''.join( displayed_parts ), ''.join( modified_parts )

if __name__ == "__main__":
    # Only the command-line entry point writes CSV, so the import is local.
    import csv

    # Handle the script parameter.  sys.exit() with a string prints it on
    # stderr and ends the process with a non-zero status.
    if len( sys.argv ) != 2 :
        sys.exit( "Please give one etherpad file name as argument for this script" )

    # Open the etherpad file (which is a JSON export of a variable).
    with open( sys.argv[1], encoding='utf-8' ) as json_file :
        document = json.load( json_file )

    # First, extract only the revisions from the Etherpad data.
    # (In the meantime, build the authors list to ease display later.)
    # `revisions` has to remain a module-level name: apply_pad_revision()
    # reads it as a global.
    revisions = {}
    authors = {}
    for entry_ID, entry in document.items() :
        tokens = entry_ID.split( ':' )
        if entry_ID.startswith( 'pad:' ) :
            # For all pad entries, tokens[0] = "pad" and tokens[1] = pad ID.
            # For revision entries, tokens[2] = "revs" and tokens[3] =
            #   revision sequential number
            # (there might be several pads by design but not in this case)
            if len( tokens ) == 4 and tokens[2] == 'revs' :
                revisions[ tokens[3] ] = entry
        if entry_ID.startswith( 'globalAuthor:' ) :
            # An author entry may come without a name.
            authors[ tokens[1] ] = entry.get( 'name' )

    # Then go through the revisions sorted by sequential revision number
    # and change the text one step at a time.
    # csv.writer quotes fields holding commas, quotes or newlines, which
    # the changed text routinely contains.
    writer = csv.writer( sys.stdout, lineterminator='\n' )
    # NOTE(review): Etherpad is understood to create a pad holding a single
    # newline and to store revision 0 as a changeset against it; confirm
    # against the Etherpad version that produced the export.
    text = '\n'
    for revision_ID in sorted( revisions, key=int ) :
        # Every revision, revision 0 included, has to be applied so that
        # the next changeset works on the right text.
        displayed_text, text = apply_pad_revision( revision_ID, text )
        if revision_ID == '0' :
            # Original version of the pad: no row is written for it.
            continue

        revision_info = revisions[ revision_ID ][ 'meta' ]
        author_ID = revision_info[ 'author' ]
        # Fall back to the raw author ID when no name is known.
        author = authors.get( author_ID ) or author_ID
        # The timestamp is in milliseconds; it is rendered in local time.
        timestamp = datetime.fromtimestamp( revision_info[ 'timestamp' ] / 1000 ).isoformat()

        writer.writerow( [ revision_ID, author, timestamp, displayed_text ] )

JChauvin5 · Novembre 21, 2024, 9:16

Merci +++ à vous deux !!!
Je teste cela et vous tiens au courant.
Bien à vous JL

JChauvin5 · Novembre 21, 2024, 9:25

Bjr Geppetto,

Je n’avais pas vu la réponse de juillet… Merci à vous deux. Je teste cela!