
    bio7                         d dl Z d dlmZmZ d dlZd dlZd dlmZ ddl	m
Z
  e
       rd dlmZ d Z G d d	      Z G d
 d      Zy)    N)DictList)Image   )is_torchvision_available)
transformsc                    t        | j                   d|z  k\  rT| j                  t        d | j                  D              t        j
                        } t        | j                   d|z  k\  rTt        | j                   |kD  rR|t        | j                   z  | j                  t        fd| j                  D              t        j                        } t        | j                   dk  rRdt        | j                   z  | j                  t        fd| j                  D              t        j                        } t        j                  |       }|j                  d   dz  dz  }|j                  d   dz  |z
  }|j                  d   dz  dz  }|j                  d   dz  |z
  }|||j                  d   |z
  ||j                  d   |z
  f   }t	        j                  |      S )	z
    Crop the image so that its height and width does not exceed `max_image_size`, while ensuring both the height and
    width are multiples of 16.
       c              3   &   K   | ]	  }|d z    yw)r
   N ).0xs     h/home/cdr/jupyterlab/.venv/lib/python3.12/site-packages/diffusers/pipelines/omnigen/processor_omnigen.py	<genexpr>zcrop_image.<locals>.<genexpr>#   s     *Ja16*Js   )resamplec              3   :   K   | ]  }t        |z          y wNroundr   r   scales     r   r   zcrop_image.<locals>.<genexpr>'        *T5U+;*T      c              3   :   K   | ]  }t        |z          y wr   r   r   s     r   r   zcrop_image.<locals>.<genexpr>+   r   r   r      )minsizeresizetupler   BOXmaxBICUBICnparrayshape	fromarray)	pil_imagemax_image_sizearrcrop_y1crop_y2crop_x1crop_x2r   s          @r   
crop_imager/      s   
 y~~
!n"4
4$$U*J9>>*J%JUZU^U^$_	 y~~
!n"4
4 INNn,inn!55$$U*TY^^*T%T_d_l_l$m	
INNb S)..))$$U*TY^^*T%T_d_l_l$m	
((9
Cyy|b Q&GiilR')Gyy|b Q&GiilR')G
g		!w..#))A,:P0PP
QC??3    c                       e Zd ZddefdZd Zd Zd Zd Z	 	 	 	 	 	 	 	 dde	e
   d	e	e	e
      d
edede
dededededefdZy)OmniGenMultiModalProcessorr)   c           	          || _         | _        t        j                  t        j                  fd      t        j
                         t        j                  g dg dd      g      | _        t               | _	        y )Nc                     t        |       S r   r/   r(   r)   s    r   <lambda>z5OmniGenMultiModalProcessor.__init__.<locals>.<lambda>?       Jy.4Y r0         ?r:   r:   Tmeanstdinplace)
text_tokenizerr)   r   ComposeLambdaToTensor	Normalizeimage_transformOmniGenCollatorcollator)selfr?   r)   s     `r   __init__z#OmniGenMultiModalProcessor.__init__9   se    ,,)11!!"YZ##%$$/X\] 
 ()r0   c           	          | _         t        j                  t        j                  fd      t        j                         t        j
                  g dg dd      g      | _        y )Nc                     t        |       S r   r5   r6   s    r   r7   zAOmniGenMultiModalProcessor.reset_max_image_size.<locals>.<lambda>K   r8   r0   r9   Tr;   )r)   r   r@   rA   rB   rC   rD   )rG   r)   s    `r   reset_max_image_sizez/OmniGenMultiModalProcessor.reset_max_image_sizeG   sQ    ,)11!!"YZ##%$$/X\] 
r0   c                     t        |t              r$t        j                  |      j	                  d      }| j                  |      S )NRGB)
isinstancestrr   openconvertrD   )rG   images     r   process_imagez(OmniGenMultiModalProcessor.process_imageQ   s6    eS!JJu%--e4E##E**r0   c           	         | j                  |      }|t        |      dk(  r!| j                  |      }|j                  d d dS d}t	        j
                  ||      D cg c]  }| j                  |      j                   }}t        dt        |            D ]  }||   d   dk(  s||   dd  ||<    t	        j                  ||      }|D 	cg c]1  }	t        |	j                  d      d   j                  d      d         3 }
}	t        t        |
            }|t        t        dt        |      dz               k(  s
J d|        t        |      t        |      k(  s J d	t        |       d
t        |       d       |
D cg c]
  }||dz
      }}g }g }t        t        |            D ]  }|j                  ||          |t        |      dz
  k7  s)t        |      }||   j                  d      ||   j                  d      z  dz  dz  }|j                  |||z   g       |j                  dg|z          |||dS c c}w c c}	w c c}w )Nr   )	input_idspixel_valuesimage_sizesz<\|image_\d+\|>r   |_zSimage_ids must start from 1, and must be continuous int, e.g. [1, 2, 3], cannot be z?total images must be the same as the number of image tags, got z image tags and z imagesr   )add_prefix_instructionlenr?   rU   resplitrangefindallintsortedsetlistextendr   append)rG   textinput_imagesmodel_inputspatternchunkprompt_chunksi
image_tagss	image_idsunique_image_idsr   all_input_idsimg_inx	start_inxr   s                    r   process_multi_modal_promptz5OmniGenMultiModalProcessor.process_multi_modal_promptV   s   **403|#4#9..t4L!-!7!7^bcc$KM88T[]aKbc%,,U3==ccq#m,- 	8AQ"a'#0#3AB#7a 	8 ZZ.
BLMQSa..s3B78M	M!#i.14a5E1F1J(K#LL 	
abrast	
L #$L(99 	
McRbNcMddtux  zF  vG  uH  HO  P	
9 6??QU+??s=)* 	1A  q!12C&**.	#A++B/,q/2F2Fr2JJbPTVV	9t+;<=$$aS4Z0	1 +LY`aa= d N @s   "I 6II
c                 .    d}d}d}d}| | | | | }|S )Nz	<|user|>
z:Generate an image according to the following instructions
z<|assistant|>
<|diffusion|>z<|end|>
r   )rG   promptuser_promptgeneration_promptassistant_promptprompt_suffixs         r   r\   z1OmniGenMultiModalProcessor.add_prefix_instruction}   s;    "Y9#=!2 3F8M?K[J\]r0   Ninstructionsri   heightwidthnegative_promptuse_img_cfgseparate_cfg_inputuse_input_image_size_as_outputnum_images_per_promptreturnc
                    t        |t              r|g}|g}g }
t        t        |            D ]:  }||   }|d n||   }|-t        |      dkD  r|D cg c]  }| j	                  |       }}nd }d|vsJ | j                  ||      }d\  }}| j                  |d       }|r\|Xt        |      dk\  rJt        t        |            D cg c]  }d|dz    d }}| j                  dj                  |      |      }n|}t        |	      D ]]  }|rB|
j                  ||||d   d   j                  d	      |d   d   j                  d
      gf       G|
j                  |||||gf       _ = | j                  |
      S c c}w c c}w )Nr   z<img><|image_1|></img>)NNr   z<img><|image_z|></img> rV   r[   rZ   )
rN   rO   r`   r]   rS   rv   joinrg   r   rF   )rG   r}   ri   r~   r   r   r   r   r   r   
input_datarn   cur_instructioncur_input_imagesr   
mllm_inputneg_mllm_inputimg_cfg_mllm_inputimg_cfg_promptrY   s                       r   __call__z#OmniGenMultiModalProcessor.__call__   s    lC((>L(>L
s<() 	iA*1oO'3';ta+4D0E0ICS#TaD$6$6q$9#T #T#' /FFF88JZ[J1;.N.!<<_dSN#/C8H4IQ4NOTUXYiUjOk%l!a!eWH&E%lN%l)-)H)HR`Iacs)t&)7&01 i1%%&*.'7:??CZP^E_`aEbEgEghjEkl	 %%z>CUX^`eWf&ghi)	iB }}Z((; $U &ms   E<F)   )Nr   r   aj  low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers.TFFr   )__name__
__module____qualname__rb   rH   rK   rS   rv   r\   r   rO   boolr   r   r   r0   r   r2   r2   8   s    *s *
+
%bN )-  L #(/4%&2)3i2) 49o2) 	2)
 2) 2) 2) !2) )-2)  #2) 
2)r0   r2   c                   8    e Zd Zd	dZd Zd Zd Zd Zd Zd Z	y)
rE   c                      || _         || _        y r   )pad_token_idhidden_size)rG   r   r   s      r   rH   zOmniGenCollator.__init__   s    (&r0   c                 
   g }|j                  d      }t        |      }|D ]L  }t        j                  |      }dg||z
  z  t	        t        ||z   dz               z   }|j                  |       N t        j                  |      S )NrZ   r   r   )r   r"   torchsumre   r`   rg   
LongTensor)	rG   attention_masknum_tokens_for_output_imagesposition_idstext_length
img_lengthmasktemp_ltemp_positions	            r   create_positionzOmniGenCollator.create_position   s    $))"-56
" 	/DYYt_FC;#784fz)A-.< M .	/ --r0   c                    g }g }|j                  d      }t        |      }||z   dz   }d}|D ]  }	t        j                  |	      }
||
z
  }t        j                  t        j
                  |
dz   |
dz   f            }t        j                  |
dz   |f      }t        j                  ||gd      }t        j
                  ||
|z   dz   f      }t        j                  ||gd      }|dkD  rht        j                  |
dz   |z   |f      }t        j                  ||gd      }t        j
                  ||f      }t        j                  ||gd      }||   }||z
  }|dkD  r0d|dd| df<   t        j                  d|| j                  f      }nd}|j                  |j                  d             |j                  |       |dz  } t        j                  |d      |fS )z
        OmniGen applies causal attention to each element in the sequence, but applies bidirectional attention within
        each image sequence References: [OmniGen](https://huggingface.co/papers/2409.11340)
        rZ   r   r   )r   )dimN)r   r"   r   r   triloneszeroscatr   rg   	unsqueeze)rG   r   r   extended_maskpadding_imagesr   r   seq_leninxr   r   pad_l	temp_mask
image_maskpad_masktrue_img_lengthpad_img_lengthtemp_padding_imgss                     r   create_maskzOmniGenCollator.create_mask   s   
 $))"-56

*Q." 	DYYt_F&(E

5::FQJ
3K#LMI6A:z*BCJ		9j"9rBI*fz6IA6M)NOJ		9j"9qAIqy ;;VaZ*-De,LM!IIx&;D	 ::E7+;<!IIx&;C	:3?O'/9N!12	!n_--.$)KKaIYIY5Z$[!$(!  !4!4Q!78!!"341HC;	< yyA.>>r0   c                 f    |j                         D ]  }||   D ]  \  }}d||   ||||f<     |S )Nr   )keys)rG   r   rW   b_inxru   end_inxs         r   !adjust_attention_for_input_imagesz1OmniGenCollator.adjust_attention_for_input_images   s]     %%' 	PE&1%&8 P"	7NOu%i&779J&JKP	P r0   c           	      X   t        |D cg c]  }t        |       c}      }g }g }t        t        |            D ]  }||   }t        |      }	||	z
  }
|
dk(  r'|j                  dg|z         |j                  |       n>|j                  dg|
z  dg|	z  z          |j                  | j                  g|
z  |z          ||v sg }||   D ]$  }|j                  |D cg c]  }||
z   	 c}       & |||<    t        j                  |      t        j                  |      |fS c c}w c c}w )Nr   r   )r"   r]   r`   rg   r   r   r   )rG   rU   rW   r   max_l
padded_idsr   rn   temp_idsr   r   new_inxold_inxs                r   pad_input_idszOmniGenCollator.pad_input_ids   s7   Y/SV/0
s9~& 	)A |H]FFNEz%%qcEk2!!(+%%qcEkQC&L&@A!!4#4#4"5"="HIK*1~ AGNNw#?!AI#?@A!(A	)" 
+U-=-=n-M{ZZ+ 0$ $@s   D"D'c                    g }|D ]"  }|j                  |d   |d   z  dz  dz         $ g i }}d}|D ]I  }|d   =|j                  |d          |d   D ]!  }	||vr|	g||<   ||   j                  |	       # |dz  }K |D cg c]  }|j                  d       }}|D cg c]  }|d   	 }
}| j                  |
|      \  }}}| j	                  ||      }| j                  ||      \  }}| j                  ||      }||||||fS c c}w c c}w )Nr   r   r   rV   rW   rU   )rg   rf   r   r   r   r   r   )rG   mllm_inputstarget_img_sizer   img_sizerV   rW   r   r   r   rU   padded_input_idsr   r   r   s                  r   process_mllm_inputz"OmniGenCollator.process_mllm_input  sk   ')$' 	WH(//hqk0IR0OSU0UV	W %'k 	A ,##An$56m, 8DK/.2VE*#E*11$7	8
 QJE	 1==1A==-89Q{^9	98<8J8J9Va8b5.+++N<XY)-)9)9.Jf)g&??P[\~~|]hhh >9s   D D
c                 @   |D cg c]  }|d   	 }}|D cg c]  }|d   	 }}|D cg c]  }|d   	 }}|D cg c]  }|d   	 }}|d   ||z   |z   }||z   |z   }n
||z   }||z   }| j                  ||      \  }}}	}
}}||	|||d}|S c c}w c c}w c c}w c c}w )Nr   r   r
   r   )rU   r   r   input_pixel_valuesinput_image_sizes)r   )rG   featuresfr   cfg_mllm_inputsr   r   all_padded_input_idsall_position_idsall_attention_maskall_padding_imagesall_pixel_valuesall_image_sizesdatas                 r   r   zOmniGenCollator.__call__/  s    %-.qt..)12A1Q422,45qad55)12A1Q422a ,%7:LLK-?/QO%7K-?O ##KA	
  .0,"2!0
 9 /252s   BBBBN)r
   i   )
r   r   r   rH   r   r   r   r   r   r   r   r0   r   rE   rE      s(    '
.)?V[0i4r0   rE   )r^   typingr   r   numpyr$   r   PILr   utilsr   torchvisionr   r/   r2   rE   r   r0   r   <module>r      sB    
     - & 6) )DR Rr0   