HEX
Server: Apache/2.4.52 (Ubuntu)
System: Linux spn-python 5.15.0-89-generic #99-Ubuntu SMP Mon Oct 30 20:42:41 UTC 2023 x86_64
User: arjun (1000)
PHP: 8.1.2-1ubuntu2.20
Disabled: NONE
Upload Files
File: //usr/local/lib/python3.10/dist-packages/tiktoken/__pycache__/_educational.cpython-310.pyc
o

;��g% �@shdZddlmZddlZddlZddlZGdd�d�Z	dd dd�Z	dd!dd�Zd"dd�Z	dd�Z
dS)#zJThis is an educational implementation of the byte pair encoding algorithm.�)�annotationsNc@sZeZdZd!dd�Zd"d#dd�Zd$dd�Zd%dd�Zd&dd�Zed'dd��Z	edd��Z
d S)(�SimpleBytePairEncoding�pat_str�str�mergeable_ranks�dict[bytes, int]�return�NonecCs0||_||_dd�|��D�|_t�|�|_dS)zCreates an Encoding object.cSsi|]\}}||�qS�r
)�.0�token_bytes�tokenr
r
�@/usr/local/lib/python3.10/dist-packages/tiktoken/_educational.py�
<dictcomp>�z3SimpleBytePairEncoding.__init__.<locals>.<dictcomp>N)rr�items�_decoder�regex�compile�_pat)�selfrrr
r
r�__init__
szSimpleBytePairEncoding.__init__�colour�text�	visualise�
str | None�	list[int]cCsB|j�|�}g}|D]}|�d�}t|j||d�}|�|�q
|S)z`Encodes a string into tokens.

        >>> enc.encode("hello world")
        [388, 372]
        �utf-8)r)r�findall�encode�
bpe_encoder�extend)rrr�words�tokens�word�
word_bytes�word_tokensr
r
rrs
zSimpleBytePairEncoding.encoder#�bytescsd��fdd�|D��S)znDecodes a list of tokens into bytes.

        >>> enc.decode_bytes([388, 372])
        b'hello world'
        �c3s�|]}�j|VqdS�N�r�rr
�rr
r�	<genexpr>-s�z6SimpleBytePairEncoding.decode_bytes.<locals>.<genexpr>)�join�rr#r
r,r�decode_bytes'sz#SimpleBytePairEncoding.decode_bytescCs|�|�jddd�S)uDecodes a list of tokens into a string.

        Decoded bytes are not guaranteed to be valid UTF-8. In that case, we replace
        the invalid bytes with the replacement character "�".

        >>> enc.decode([388, 372])
        'hello world'
        r�replace��errors)r0�decoder/r
r
rr4/s	zSimpleBytePairEncoding.decode�list[bytes]cs�fdd�|D�S)z�Decodes a list of tokens into a list of bytes.

        Useful for visualising how a string is tokenised.

        >>> enc.decode_tokens_bytes([388, 372])
        [b'hello', b' world']
        csg|]}�j|�qSr
r*r+r,r
r�
<listcomp>Brz>SimpleBytePairEncoding.decode_tokens_bytes.<locals>.<listcomp>r
r/r
r,r�decode_tokens_bytes:sz*SimpleBytePairEncoding.decode_tokens_bytes�
training_data�
vocab_size�intcCst|||d�}t||d�S)z#Train a BPE tokeniser on some data!)�datar9r�rr)�	bpe_trainr)r8r9rrr
r
r�trainDszSimpleBytePairEncoding.traincCs$t|t�r
t�|�}t|j|jd�S)Nr<)�
isinstancer�tiktoken�get_encodingr�_pat_str�_mergeable_ranks)�encodingr
r
r�
from_tiktokenJs


�z$SimpleBytePairEncoding.from_tiktokenN)rrrrrr	�r)rrrrrr)r#rrr')r#rrr)r#rrr5)r8rr9r:rr)�__name__�
__module__�__qualname__rrr0r4r7�staticmethodr>rEr
r
r
rrs





rrrr�inputr'rrrrc
sdd�|D�}	|r|dvrt|�n|dkrt|�d}d}tt|dd�|dd���D]\}}��|d|d�}|durM|dusI||krM|}|}q.|durSn |dusYJ�|d|�||||dg||d	d�}q|rxt��fd
d�|D�}	|	S)NcS�g|]}t|g��qSr
�r'�r�br
r
rr6Vrzbpe_encode.<locals>.<listcomp>T�r�color�simple����r�csg|]}�|�qSr
r
)r�part�rr
rr6ss)�visualise_tokens�print�	enumerate�zip�get)
rrKr�parts�min_idx�min_rank�i�pair�rankr#r
rWrr Ss0
&�2�r r;rr9r:rcs|dkrtd��i}td�D]	}||t|g�<qdd�t�||�D�}t|�|k�rt���|D]}t|dd�|dd��D]
}�|d7<q?q0t	��fdd�d	�}	|	d
|	d}
t|�}|||
<g}|D]K}
g}d
}|t|
�dkr�|
||
|df|	kr�|�
|
�|d7}n|�
|
|�|d7}|t|
�dksw|t|
�dkr�|�
|
|�|�
|�qi|}|�rtd|	d
�d
|	d���td|
�dt|��d��|dvr�td�tdd�|dd�D��n|dk�rtd�|dd�D]}
t|
�q�td�t|�|ks*|S)N�z;vocab_size must be at least 256, so we can encode all bytescSs g|]}dd�|�d�D��qS)cSrLr
rMrNr
r
rr6�rz(bpe_train.<locals>.<listcomp>.<listcomp>r)r)rr$r
r
rr6�s�zbpe_train.<locals>.<listcomp>rSrTcs�|Sr)r
)�x��statsr
r�<lambda>�szbpe_train.<locals>.<lambda>)�keyrrUz The current most common pair is z + zSo we made z our zth tokenrPz9Now the first fifty words in our training data look like:cSsg|]	}|D]}|�qqSr
r
)rr$r
r
r
rr6���2rRz:Now the first twenty words in our training data look like:��
)
�
ValueError�ranger'rr�len�collections�Counterr[�max�appendrYrX)r;r9rr�ranksr`r"�piecera�most_common_pairrr
�	new_wordsr$�new_wordr
rerr=ws\
��

�

�-r=�token_valuesr5r	cCs�dd�dD�}dd�|D�}d}d}|D].}||t|�}||kr2||dt|�}||ks2J�|}|t|�7}t||dd�qtd	�dS)
NcSsg|]}d|�d��qS)z[48;5;�mr
)rr`r
r
rr6�sz$visualise_tokens.<locals>.<listcomp>)����M�P�D�cSsg|]	}|jddd��qS)rr1r2)r4)rrdr
r
rr6�rirrT�)�endz)rorY)ry�
background�unicode_token_values�running_length�
last_colorr
rQr
r
rrX�srXcCs�d}tt��}|��}Wd�n1swYtj|d|d�}td�|�d�}|�|�dks4J�|�|�dks=J�|�	|�ddgksHJ�|S)	NzN's|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+iX)r9rzJThis is the sequence of merges performed in order to encode 'hello world':zhello worldshello worldshellos world)
�open�__file__�readrr>rYrr4r0r7)�gpt2_pattern�fr;�encr#r
r
r�train_simple_encoding�s�

�
r�rF)rrrKr'rrrr)
r;rr9r:rrrrrr)ryr5rr	)�__doc__�
__future__rrprr@rr r=rXr�r
r
r
r�<module>sH�%�
E